In [2]:
# One-time setup: uncomment these two lines to download and extract the dataset.
# !wget https://pages.cs.wisc.edu/~harter/cs544/data/hdma-wi-2021.zip
# !unzip hdma-wi-2021.zip

In [3]:
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet

In [10]:
%%time
# Baseline: parse the whole CSV into an in-memory pyarrow Table and time it.
t = pa.csv.read_csv("hdma-wi-2021.csv")

CPU times: user 1.09 s, sys: 810 ms, total: 1.9 s
Wall time: 506 ms


In [6]:
# Save the table loaded from the CSV as a Parquet file so the two formats can be compared.
pa.parquet.write_table(t, "hdma-wi-2021.parquet")

In [7]:
# Point 1: Parquet stores the schema in the file, so reading it skips the slow type inference that CSV requires.

In [11]:
%%time
# Load every column back from the Parquet file (rebinds `t`); compare the timing with the CSV read.
t = pa.parquet.read_table("hdma-wi-2021.parquet")

CPU times: user 400 ms, sys: 118 ms, total: 517 ms
Wall time: 157 ms


In [12]:
# Point 2: Parquet uses a binary encoding, whereas CSV is plain text (compare the first 100 bytes of each file below).

In [13]:
# Peek at the first 100 raw bytes of the CSV file; opened in binary mode so
# the bytes are shown exactly as stored on disk.
with open("hdma-wi-2021.csv", "rb") as f:
    csv_head = f.read(100)
print(csv_head)

b'activity_year,lei,derived_msa-md,state_code,county_code,census_tract,conforming_loan_limit,derived_l'


In [14]:
# Peek at the first 100 raw bytes of the Parquet file; opened in binary mode
# so the bytes are shown exactly as stored on disk.
with open("hdma-wi-2021.parquet", "rb") as f:
    parquet_head = f.read(100)
print(parquet_head)

b'PAR1\x15\x04\x15\x10\x15\x14L\x15\x02\x15\x00\x12\x00\x00\x08\x1c\xe5\x07\x00\x00\x00\x00\x00\x00\x15\x00\x15\x1a\x15\x1e,\x15\x8e\xce6\x15\x10\x15\x06\x15\x06\x1c\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x16\x00(\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\r0\x04\x00\x00\x00\x8e\xce6'


In [15]:
# Point 3: Parquet is column-oriented, so a subset of columns can be read without loading the rest.

In [17]:
%%time
t2 = pa.parquet.read_table("hdma-wi-2021.parquet", columns=["lei", "census_tract"])

CPU times: user 26.8 ms, sys: 4.41 ms, total: 31.2 ms
Wall time: 22.3 ms


In [16]:
# Display the full table `t` (not the two-column `t2`); the repr lists each column name with its type.
t

pyarrow.Table
activity_year: int64
lei: string
derived_msa-md: int64
state_code: string
county_code: int64
census_tract: int64
conforming_loan_limit: string
derived_loan_product_type: string
derived_dwelling_category: string
derived_ethnicity: string
derived_race: string
derived_sex: string
action_taken: int64
purchaser_type: int64
preapproval: int64
loan_type: int64
loan_purpose: int64
lien_status: int64
reverse_mortgage: int64
open-end_line_of_credit: int64
business_or_commercial_purpose: int64
loan_amount: double
loan_to_value_ratio: string
interest_rate: string
rate_spread: string
hoepa_status: int64
total_loan_costs: string
total_points_and_fees: string
origination_charges: string
discount_points: string
lender_credits: string
loan_term: string
prepayment_penalty_term: string
intro_rate_period: string
negative_amortization: int64
interest_only_payment: int64
balloon_payment: int64
other_nonamortizing_features: int64
property_value: string
construction_method: int64
occupancy_type:

In [18]:
# Point 4: pyarrow compresses Parquet files with snappy by default.

In [19]:
# List the working directory with human-readable sizes to compare the CSV, zip, and Parquet files.
!ls -lh

total 204M
-rw-r----- 1 tharter tharter 167M Nov 1 2022 hdma-wi-2021.csv
-rw-rw-r-- 1 tharter tharter 16M Feb 24 11:08 hdma-wi-2021.parquet
-rw-rw-r-- 1 tharter tharter 21M Jan 5 2023 hdma-wi-2021.zip
-rw-rw-r-- 1 tharter tharter 18K Feb 24 10:11 lec1.ipynb
-rw-rw-r-- 1 tharter tharter 16K Feb 24 11:14 lec2.ipynb


In [23]:
%%time
# Rewrite the same Parquet file with the snappy codec named explicitly, timing the write.
pa.parquet.write_table(t, "hdma-wi-2021.parquet", compression="snappy")

CPU times: user 716 ms, sys: 24.2 ms, total: 740 ms
Wall time: 754 ms


In [24]:
%%time
# Write a second copy using gzip to a separate file, so write time and file size can be compared with snappy.
pa.parquet.write_table(t, "hdma-wi-2021-gzip.parquet", compression="gzip")

CPU times: user 2.15 s, sys: 15.7 ms, total: 2.17 s
Wall time: 2.17 s


In [25]:
# List the directory again to compare the sizes of the gzip and snappy Parquet files.
!ls -lh

total 216M
-rw-r----- 1 tharter tharter 167M Nov 1 2022 hdma-wi-2021.csv
-rw-rw-r-- 1 tharter tharter 13M Feb 24 11:15 hdma-wi-2021-gzip.parquet
-rw-rw-r-- 1 tharter tharter 16M Feb 24 11:15 hdma-wi-2021.parquet
-rw-rw-r-- 1 tharter tharter 21M Jan 5 2023 hdma-wi-2021.zip
-rw-rw-r-- 1 tharter tharter 18K Feb 24 10:11 lec1.ipynb
-rw-rw-r-- 1 tharter tharter 16K Feb 24 11:14 lec2.ipynb
