Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • cdis/cs/courses/cs544/s25/main
  • zzhang2478/main
  • spark667/main
  • vijayprabhak/main
  • vijayprabhak/544-main
  • wyang338/cs-544-s-25
  • jmin39/main
7 results
Show changes
Commits on Source (53)
Showing
with 8058 additions and 0 deletions
File added
File added
File added
File added
File added
File added
File added
File added
%% Cell type:code id:2dbe2890-3b90-4978-9648-625dc8d7a949 tags:
``` python
# One-time setup: uncomment and run these two lines to download and unzip the dataset.
# !wget https://pages.cs.wisc.edu/~harter/cs544/data/hdma-wi-2021.zip
# !unzip hdma-wi-2021.zip
```
%% Cell type:code id:3a4cfa35-6b4e-48bc-acc6-09d426cd6c3e tags:
``` python
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet
```
%% Cell type:code id:e5acbdd4-a266-477a-b720-a68e16e6c8f6 tags:
``` python
%%time
# Parse the whole CSV into a pyarrow Table. A CSV carries no type information,
# so the column types have to be inferred while reading.
t = pa.csv.read_csv("hdma-wi-2021.csv")
```
%% Output
CPU times: user 1.18 s, sys: 996 ms, total: 2.18 s
Wall time: 575 ms
%% Cell type:code id:dd5941ac-fc66-4037-81f8-4f32b5b9e4bc tags:
``` python
# Save the same table as Parquet (default writer settings) so the two formats
# can be compared in the cells that follow.
pa.parquet.write_table(t, "hdma-wi-2021.parquet")
```
%% Cell type:code id:ed698328-e057-41a5-89d9-351166331a6e tags:
``` python
# point 1: Parquet lets us skip slow schema inference
```
%% Cell type:code id:3c3b8e74-d1d6-412e-9dc9-f6f1cc252ce7 tags:
``` python
%%time
# Load the full table back from Parquet; compare the wall time reported here
# with the CSV read earlier in the notebook.
t = pa.parquet.read_table("hdma-wi-2021.parquet")
```
%% Output
CPU times: user 396 ms, sys: 102 ms, total: 498 ms
Wall time: 147 ms
%% Cell type:code id:d582cd7a-27ed-4c09-a22e-8663bd24e231 tags:
``` python
# point 2: Parquet uses a binary encoding (CSV is plain text)
```
%% Cell type:code id:9e2d33ce-3a07-405a-b379-ef86e6e0e076 tags:
``` python
# Peek at the first 100 raw bytes of the CSV: plain text, starting with the
# comma-separated header row.
with open("hdma-wi-2021.csv", "rb") as f:
    print(f.read(100))
```
%% Output
b'activity_year,lei,derived_msa-md,state_code,county_code,census_tract,conforming_loan_limit,derived_l'
%% Cell type:code id:76debfbf-a7ca-48f6-8ca0-c9a00fbd3dd9 tags:
``` python
# Peek at the first 100 raw bytes of the Parquet file: after the leading
# b'PAR1' marker the content is binary, not readable text.
with open("hdma-wi-2021.parquet", "rb") as f:
    print(f.read(100))
```
%% Output
b'PAR1\x15\x04\x15\x10\x15\x14L\x15\x02\x15\x00\x12\x00\x00\x08\x1c\xe5\x07\x00\x00\x00\x00\x00\x00\x15\x00\x15\x1a\x15\x1e,\x15\x8e\xce6\x15\x10\x15\x06\x15\x06\x1c\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x16\x00(\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\r0\x04\x00\x00\x00\x8e\xce6'
%% Cell type:code id:d891d58c-8d8b-4dc9-bb2f-d12a03db465e tags:
``` python
# point 3: Parquet files are column oriented
```
%% Cell type:code id:3159c561-122d-46af-96fb-0093368f6086 tags:
``` python
%%time
# Ask for just two columns; the wall time reported below is far lower than
# for the full-table read.
t2 = pa.parquet.read_table("hdma-wi-2021.parquet", columns=["lei", "census_tract"])
```
%% Output
CPU times: user 26.3 ms, sys: 2.61 ms, total: 29 ms
Wall time: 20.8 ms
%% Cell type:code id:51ad1ba1-9ea7-4188-b96d-aaa1847f3e95 tags:
``` python
# Display the full table: the schema (column name: type) followed by a
# truncated preview of each column's values.
t
```
%% Output
pyarrow.Table
activity_year: int64
lei: string
derived_msa-md: int64
state_code: string
county_code: int64
census_tract: int64
conforming_loan_limit: string
derived_loan_product_type: string
derived_dwelling_category: string
derived_ethnicity: string
derived_race: string
derived_sex: string
action_taken: int64
purchaser_type: int64
preapproval: int64
loan_type: int64
loan_purpose: int64
lien_status: int64
reverse_mortgage: int64
open-end_line_of_credit: int64
business_or_commercial_purpose: int64
loan_amount: double
loan_to_value_ratio: string
interest_rate: string
rate_spread: string
hoepa_status: int64
total_loan_costs: string
total_points_and_fees: string
origination_charges: string
discount_points: string
lender_credits: string
loan_term: string
prepayment_penalty_term: string
intro_rate_period: string
negative_amortization: int64
interest_only_payment: int64
balloon_payment: int64
other_nonamortizing_features: int64
property_value: string
construction_method: int64
occupancy_type: int64
manufactured_home_secured_property_type: int64
manufactured_home_land_property_interest: int64
total_units: string
multifamily_affordable_units: string
income: int64
debt_to_income_ratio: string
applicant_credit_score_type: int64
co-applicant_credit_score_type: int64
applicant_ethnicity-1: int64
applicant_ethnicity-2: int64
applicant_ethnicity-3: int64
applicant_ethnicity-4: int64
applicant_ethnicity-5: int64
co-applicant_ethnicity-1: int64
co-applicant_ethnicity-2: int64
co-applicant_ethnicity-3: int64
co-applicant_ethnicity-4: int64
co-applicant_ethnicity-5: null
applicant_ethnicity_observed: int64
co-applicant_ethnicity_observed: int64
applicant_race-1: int64
applicant_race-2: int64
applicant_race-3: int64
applicant_race-4: int64
applicant_race-5: int64
co-applicant_race-1: int64
co-applicant_race-2: int64
co-applicant_race-3: int64
co-applicant_race-4: int64
co-applicant_race-5: int64
applicant_race_observed: int64
co-applicant_race_observed: int64
applicant_sex: int64
co-applicant_sex: int64
applicant_sex_observed: int64
co-applicant_sex_observed: int64
applicant_age: string
co-applicant_age: string
applicant_age_above_62: string
co-applicant_age_above_62: string
submission_of_application: int64
initially_payable_to_institution: int64
aus-1: int64
aus-2: int64
aus-3: int64
aus-4: int64
aus-5: int64
denial_reason-1: int64
denial_reason-2: int64
denial_reason-3: int64
denial_reason-4: int64
tract_population: int64
tract_minority_population_percent: double
ffiec_msa_md_median_family_income: int64
tract_to_msa_income_percentage: int64
tract_owner_occupied_units: int64
tract_one_to_four_family_homes: int64
tract_median_age_of_housing_units: int64
----
activity_year: [[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021]]
lei: [["54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80",...,"254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219"],["254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219",...,"549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46"],["549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46",...,"ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18"],["ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18",...,"54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80"]]
derived_msa-md: [[99999,99999,99999,29404,11540,...,33460,20740,33460,33460,99999],[99999,33460,33460,33460,20740,...,99999,33340,33340,33340,33340],[99999,33340,39540,33340,39540,...,36780,36780,11540,33340,33340],[29100,31540,99999,22540,99999,...,31540,99999,31540,99999,31540]]
state_code: [["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"]]
county_code: [[55027,55001,55013,55059,55087,...,55109,55017,55093,55109,55033],[55095,55109,55109,55109,55017,...,55027,55079,55133,55133,55079],[55027,55133,55101,55079,55101,...,55139,55139,55087,55131,55079],[55063,55021,55011,55039,55097,...,55025,55029,55025,55051,55021]]
census_tract: [[55027961800,55001950501,55013970400,55059002000,55087013300,...,55109121000,55017011100,55093960700,55109120904,55033970400],[55095960500,55109120700,55109121000,55109120904,55017010700,...,55027961500,55079090300,55133203305,55133203406,55079000303],[55027960800,55133201600,55101000901,55079016100,55101002402,...,55139001100,55139001803,55087012100,55131450104,55079150301],[55063010201,55021970100,55011960400,55039041300,55097960600,...,55025011301,55029100800,55025012300,55051180300,55021970300]]
conforming_loan_limit: [["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"]]
derived_loan_product_type: [["Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien",...,"Conventional:First Lien","Conventional:First Lien","FSA/RHS:First Lien","Conventional:Subordinate Lien","Conventional:First Lien"],["Conventional:First Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien",...,"Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:First Lien"],["Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien",...,"Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien"],["Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:First Lien",...,"Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien"]]
derived_dwelling_category: [["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"]]
derived_ethnicity: [["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Joint",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Ethnicity Not Available","Not Hispanic or Latino",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Hispanic or Latino","Hispanic or Latino",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Ethnicity Not Available",...,"Ethnicity Not Available","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"]]
...
%% Cell type:code id:28a0a2d6-95a1-4c86-9577-ef62721c19d6 tags:
``` python
# point 4: Parquet is compressed, with snappy by default
```
%% Cell type:code id:9ee9812e-76d9-4bf3-9194-4556f83bf0ff tags:
``` python
%%time
# Rewrite the file with the codec spelled out; snappy is also what
# write_table uses when no compression argument is given.
pa.parquet.write_table(t, "hdma-wi-2021.parquet", compression="snappy")
```
%% Output
CPU times: user 698 ms, sys: 18.6 ms, total: 717 ms
Wall time: 730 ms
%% Cell type:code id:487c9b6f-559c-4777-b498-b4f41cd8667a tags:
``` python
%%time
# Same table written with gzip: the listing below shows a smaller file than
# snappy (13M vs 16M), but the write takes roughly three times as long.
pa.parquet.write_table(t, "hdma-wi-2021-gzip.parquet", compression="gzip")
```
%% Output
CPU times: user 2.21 s, sys: 13.9 ms, total: 2.22 s
Wall time: 2.22 s
%% Cell type:code id:f2c3e5a3-7bf2-4f92-8019-e13b8cc28c28 tags:
``` python
# List the working directory to compare the on-disk sizes of the CSV and the
# two Parquet files.
!ls -lh
```
%% Output
total 216M
-rw-rw-r-- 1 tharter tharter 333 Feb 20 15:45 Dockerfile
-rw-r----- 1 tharter tharter 167M Nov 1 2022 hdma-wi-2021.csv
-rw-rw-r-- 1 tharter tharter 13M Feb 24 09:04 hdma-wi-2021-gzip.parquet
-rw-rw-r-- 1 tharter tharter 16M Feb 24 09:03 hdma-wi-2021.parquet
-rw-rw-r-- 1 tharter tharter 21M Jan 5 2023 hdma-wi-2021.zip
-rw-rw-r-- 1 tharter tharter 17K Feb 24 09:03 lec1.ipynb
drwxrwxr-x 3 tharter tharter 4.0K Feb 21 09:26 old
-rw-rw-r-- 1 tharter tharter 1.8K Feb 20 15:45 requirements.txt
drwxrwxr-x 3 tharter tharter 4.0K Feb 21 11:39 shared
%% Cell type:code id:92459058-9a4b-4da7-8820-65a6ebb70801 tags:
``` python
```
%% Cell type:code id:803ee3d0-11db-4b3c-901f-c09969c42a99 tags:
``` python
# One-time setup: uncomment and run these two lines to download and unzip the dataset.
# !wget https://pages.cs.wisc.edu/~harter/cs544/data/hdma-wi-2021.zip
# !unzip hdma-wi-2021.zip
```
%% Cell type:code id:dc2864a3-8f39-440e-9cc1-a88e6fcf764b tags:
``` python
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet
```
%% Cell type:code id:ddb42c11-e332-42cd-be7f-4ce2ce74ed6b tags:
``` python
%%time
# Parse the whole CSV into a pyarrow Table. A CSV carries no type information,
# so the column types have to be inferred while reading.
t = pa.csv.read_csv("hdma-wi-2021.csv")
```
%% Output
CPU times: user 1.09 s, sys: 810 ms, total: 1.9 s
Wall time: 506 ms
%% Cell type:code id:a582a756-1c75-4869-8f27-b849320213e1 tags:
``` python
# Save the same table as Parquet (default writer settings) so the two formats
# can be compared in the cells that follow.
pa.parquet.write_table(t, "hdma-wi-2021.parquet")
```
%% Cell type:code id:80d2c0a6-9c6e-475d-a56b-99a4e7fbb05d tags:
``` python
# point 1: we don't need to do slow schema inference with parquet
```
%% Cell type:code id:c2f34f52-60c6-48c5-b29f-4f6131fb593e tags:
``` python
%%time
# Load the full table back from Parquet; compare the wall time reported here
# with the CSV read earlier in the notebook.
t = pa.parquet.read_table("hdma-wi-2021.parquet")
```
%% Output
CPU times: user 400 ms, sys: 118 ms, total: 517 ms
Wall time: 157 ms
%% Cell type:code id:99005a4b-4eb8-4970-a1cd-65422bb6dad6 tags:
``` python
# point 2: parquet uses a binary encoding
```
%% Cell type:code id:abc45de9-6b4d-4b4e-a2ff-127a69a9e3c0 tags:
``` python
# Peek at the first 100 raw bytes of the CSV: plain text, starting with the
# comma-separated header row.
with open("hdma-wi-2021.csv", "rb") as f:
    print(f.read(100))
```
%% Output
b'activity_year,lei,derived_msa-md,state_code,county_code,census_tract,conforming_loan_limit,derived_l'
%% Cell type:code id:e642e241-078c-45ba-8cf0-6a7587720e17 tags:
``` python
# Peek at the first 100 raw bytes of the Parquet file: after the leading
# b'PAR1' marker the content is binary, not readable text.
with open("hdma-wi-2021.parquet", "rb") as f:
    print(f.read(100))
```
%% Output
b'PAR1\x15\x04\x15\x10\x15\x14L\x15\x02\x15\x00\x12\x00\x00\x08\x1c\xe5\x07\x00\x00\x00\x00\x00\x00\x15\x00\x15\x1a\x15\x1e,\x15\x8e\xce6\x15\x10\x15\x06\x15\x06\x1c\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x16\x00(\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x18\x08\xe5\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\r0\x04\x00\x00\x00\x8e\xce6'
%% Cell type:code id:912485a1-23c5-49cd-bd75-16bd860f8d3a tags:
``` python
# point 3: parquet is column oriented
```
%% Cell type:code id:4095de52-6859-440c-b614-cdeb9c28ebe6 tags:
``` python
%%time
# Ask for just two columns; the wall time reported below is far lower than
# for the full-table read.
t2 = pa.parquet.read_table("hdma-wi-2021.parquet", columns=["lei", "census_tract"])
```
%% Output
CPU times: user 26.8 ms, sys: 4.41 ms, total: 31.2 ms
Wall time: 22.3 ms
%% Cell type:code id:d277e063-d31c-495d-8fe1-92883447c451 tags:
``` python
# Display the full table: the schema (column name: type) followed by a
# truncated preview of each column's values.
t
```
%% Output
pyarrow.Table
activity_year: int64
lei: string
derived_msa-md: int64
state_code: string
county_code: int64
census_tract: int64
conforming_loan_limit: string
derived_loan_product_type: string
derived_dwelling_category: string
derived_ethnicity: string
derived_race: string
derived_sex: string
action_taken: int64
purchaser_type: int64
preapproval: int64
loan_type: int64
loan_purpose: int64
lien_status: int64
reverse_mortgage: int64
open-end_line_of_credit: int64
business_or_commercial_purpose: int64
loan_amount: double
loan_to_value_ratio: string
interest_rate: string
rate_spread: string
hoepa_status: int64
total_loan_costs: string
total_points_and_fees: string
origination_charges: string
discount_points: string
lender_credits: string
loan_term: string
prepayment_penalty_term: string
intro_rate_period: string
negative_amortization: int64
interest_only_payment: int64
balloon_payment: int64
other_nonamortizing_features: int64
property_value: string
construction_method: int64
occupancy_type: int64
manufactured_home_secured_property_type: int64
manufactured_home_land_property_interest: int64
total_units: string
multifamily_affordable_units: string
income: int64
debt_to_income_ratio: string
applicant_credit_score_type: int64
co-applicant_credit_score_type: int64
applicant_ethnicity-1: int64
applicant_ethnicity-2: int64
applicant_ethnicity-3: int64
applicant_ethnicity-4: int64
applicant_ethnicity-5: int64
co-applicant_ethnicity-1: int64
co-applicant_ethnicity-2: int64
co-applicant_ethnicity-3: int64
co-applicant_ethnicity-4: int64
co-applicant_ethnicity-5: null
applicant_ethnicity_observed: int64
co-applicant_ethnicity_observed: int64
applicant_race-1: int64
applicant_race-2: int64
applicant_race-3: int64
applicant_race-4: int64
applicant_race-5: int64
co-applicant_race-1: int64
co-applicant_race-2: int64
co-applicant_race-3: int64
co-applicant_race-4: int64
co-applicant_race-5: int64
applicant_race_observed: int64
co-applicant_race_observed: int64
applicant_sex: int64
co-applicant_sex: int64
applicant_sex_observed: int64
co-applicant_sex_observed: int64
applicant_age: string
co-applicant_age: string
applicant_age_above_62: string
co-applicant_age_above_62: string
submission_of_application: int64
initially_payable_to_institution: int64
aus-1: int64
aus-2: int64
aus-3: int64
aus-4: int64
aus-5: int64
denial_reason-1: int64
denial_reason-2: int64
denial_reason-3: int64
denial_reason-4: int64
tract_population: int64
tract_minority_population_percent: double
ffiec_msa_md_median_family_income: int64
tract_to_msa_income_percentage: int64
tract_owner_occupied_units: int64
tract_one_to_four_family_homes: int64
tract_median_age_of_housing_units: int64
----
activity_year: [[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021],[2021,2021,2021,2021,2021,...,2021,2021,2021,2021,2021]]
lei: [["54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80",...,"254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219"],["254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219","254900X6OAHFW6BUT219",...,"549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46"],["549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46","549300KY533JFETOYG46",...,"ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18"],["ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18","ZF85QS7OXKPBG52R7N18",...,"54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80","54930034MNPILHP25H80"]]
derived_msa-md: [[99999,99999,99999,29404,11540,...,33460,20740,33460,33460,99999],[99999,33460,33460,33460,20740,...,99999,33340,33340,33340,33340],[99999,33340,39540,33340,39540,...,36780,36780,11540,33340,33340],[29100,31540,99999,22540,99999,...,31540,99999,31540,99999,31540]]
state_code: [["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"],["WI","WI","WI","WI","WI",...,"WI","WI","WI","WI","WI"]]
county_code: [[55027,55001,55013,55059,55087,...,55109,55017,55093,55109,55033],[55095,55109,55109,55109,55017,...,55027,55079,55133,55133,55079],[55027,55133,55101,55079,55101,...,55139,55139,55087,55131,55079],[55063,55021,55011,55039,55097,...,55025,55029,55025,55051,55021]]
census_tract: [[55027961800,55001950501,55013970400,55059002000,55087013300,...,55109121000,55017011100,55093960700,55109120904,55033970400],[55095960500,55109120700,55109121000,55109120904,55017010700,...,55027961500,55079090300,55133203305,55133203406,55079000303],[55027960800,55133201600,55101000901,55079016100,55101002402,...,55139001100,55139001803,55087012100,55131450104,55079150301],[55063010201,55021970100,55011960400,55039041300,55097960600,...,55025011301,55029100800,55025012300,55051180300,55021970300]]
conforming_loan_limit: [["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"],["C","C","C","C","C",...,"C","C","C","C","C"]]
derived_loan_product_type: [["Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien",...,"Conventional:First Lien","Conventional:First Lien","FSA/RHS:First Lien","Conventional:Subordinate Lien","Conventional:First Lien"],["Conventional:First Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien",...,"Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:First Lien"],["Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien",...,"Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien"],["Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:Subordinate Lien","Conventional:First Lien",...,"Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien","Conventional:First Lien"]]
derived_dwelling_category: [["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"],["Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built",...,"Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built","Single Family (1-4 Units):Site-Built"]]
derived_ethnicity: [["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Joint",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Ethnicity Not Available","Not Hispanic or Latino",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Hispanic or Latino","Hispanic or Latino",...,"Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"],["Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Ethnicity Not Available",...,"Ethnicity Not Available","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino","Not Hispanic or Latino"]]
...
%% Cell type:code id:41baa49f-51dd-4c6f-9c32-33ba44f4de83 tags:
``` python
# point 4: Parquet files are compressed with snappy by default
```
%% Cell type:code id:5a043407-2a33-4c0c-8adf-bef72156d3ca tags:
``` python
# Baseline listing: the Parquet file written with default settings is already
# far smaller than the CSV it was created from.
!ls -lh
```
%% Output
total 204M
-rw-r----- 1 tharter tharter 167M Nov 1 2022 hdma-wi-2021.csv
-rw-rw-r-- 1 tharter tharter 16M Feb 24 11:08 hdma-wi-2021.parquet
-rw-rw-r-- 1 tharter tharter 21M Jan 5 2023 hdma-wi-2021.zip
-rw-rw-r-- 1 tharter tharter 18K Feb 24 10:11 lec1.ipynb
-rw-rw-r-- 1 tharter tharter 16K Feb 24 11:14 lec2.ipynb
%% Cell type:code id:2ff0caa9-6f57-45b0-8ff4-910eb0ad2359 tags:
``` python
%%time
# Rewrite the file with the codec spelled out; the file size in the next
# listing is unchanged (16M), consistent with snappy being the default.
pa.parquet.write_table(t, "hdma-wi-2021.parquet", compression="snappy")
```
%% Output
CPU times: user 716 ms, sys: 24.2 ms, total: 740 ms
Wall time: 754 ms
%% Cell type:code id:ea976df1-d258-4446-a277-9a9d750bfc49 tags:
``` python
%%time
# Same table written with gzip: the listing below shows a smaller file than
# snappy (13M vs 16M), but the write takes roughly three times as long.
pa.parquet.write_table(t, "hdma-wi-2021-gzip.parquet", compression="gzip")
```
%% Output
CPU times: user 2.15 s, sys: 15.7 ms, total: 2.17 s
Wall time: 2.17 s
%% Cell type:code id:c587e2e5-2167-4663-ab5c-5df51f3f9937 tags:
``` python
# List the directory again to compare the snappy and gzip Parquet sizes.
!ls -lh
```
%% Output
total 216M
-rw-r----- 1 tharter tharter 167M Nov 1 2022 hdma-wi-2021.csv
-rw-rw-r-- 1 tharter tharter 13M Feb 24 11:15 hdma-wi-2021-gzip.parquet
-rw-rw-r-- 1 tharter tharter 16M Feb 24 11:15 hdma-wi-2021.parquet
-rw-rw-r-- 1 tharter tharter 21M Jan 5 2023 hdma-wi-2021.zip
-rw-rw-r-- 1 tharter tharter 18K Feb 24 10:11 lec1.ipynb
-rw-rw-r-- 1 tharter tharter 16K Feb 24 11:14 lec2.ipynb
%% Cell type:code id:e525a734-ac33-40b4-a8c1-dc4c30b06b02 tags:
``` python
```
This diff is collapsed.
This diff is collapsed.
# Two-container setup: an HDFS container and a Jupyter notebook container.
services:
  hdfs:
    image: p4-hdfs
    hostname: main
    ports:
      # Published on loopback only, so the port is not reachable from other hosts.
      - "127.0.0.1:9870:9870"
    deploy:
      resources:
        limits:
          memory: 2g
    # Keeps the container alive without starting any service.
    command: sleep infinity
  nb:
    image: p4-nb
    ports:
      - "127.0.0.1:5000:5000"
    volumes:
      # Share the local ./nb directory with the container at /nb.
      - "./nb:/nb"
    deploy:
      resources:
        limits:
          memory: 2g
FROM ubuntu:24.04
# Commands are chained with && (not ;) so the build stops at the first failing
# step instead of continuing with a partially prepared image.
RUN apt-get update && apt-get install -y wget curl openjdk-11-jdk python3-pip iproute2 nano
# HDFS
# Download, unpack, and remove the tarball in a single layer to keep the image small.
RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && tar -xf hadoop-3.3.6.tar.gz && rm hadoop-3.3.6.tar.gz
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
# Put the hdfs/hadoop command-line tools on PATH.
ENV PATH="${PATH}:/hadoop-3.3.6/bin"
ENV HADOOP_HOME=/hadoop-3.3.6
This diff is collapsed.
This diff is collapsed.
FROM p4-hdfs
# Builds on the HDFS image and adds JupyterLab, pyarrow, and pandas.
# --break-system-packages lets pip install into the system Python without a virtualenv.
RUN pip3 install jupyterlab==4.0.3 jupyter-client==8.4.0 pyarrow==17.0.0 pandas==2.2.3 requests==2.31.0 nbconvert==7.9.2 --break-system-packages
# CLASSPATH is filled from `hdfs classpath --glob` before JupyterLab starts;
# presumably so pyarrow's HDFS client can locate the Hadoop jars -- TODO confirm.
# NOTE(review): the empty token disables Jupyter's token authentication; this
# is only reasonable if the port is never published beyond localhost -- confirm.
CMD export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob` && \
python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5000 --allow-root --NotebookApp.token=''
FROM ubuntu:24.04
# Chained with && (like the download steps below) so a failed `apt-get update`
# stops the build instead of being silently ignored.
RUN apt-get update && apt-get install -y wget curl openjdk-11-jdk python3-pip nano
# SPARK
RUN wget https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz && tar -xf spark-3.5.5-bin-hadoop3.tgz && rm spark-3.5.5-bin-hadoop3.tgz
# HDFS
RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz && tar -xf hadoop-3.3.6.tar.gz && rm hadoop-3.3.6.tar.gz
# Jupyter
# pyspark is pinned to the same version (3.5.5) as the Spark distribution above.
# --break-system-packages lets pip install into the system Python without a virtualenv.
RUN pip3 install jupyterlab==4.3.5 pandas==2.2.3 pyspark==3.5.5 --break-system-packages
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
# Put the hdfs/hadoop command-line tools on PATH.
ENV PATH="${PATH}:/hadoop-3.3.6/bin"
ENV HADOOP_HOME=/hadoop-3.3.6
# Every service runs the same spark-demo image; only the command differs.
services:
  nb:
    image: spark-demo
    ports:
      # Both ports are published on loopback only.
      - "127.0.0.1:5000:5000"
      - "127.0.0.1:4040:4040"
    volumes:
      - "./nb:/nb"
    command: python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5000 --allow-root --NotebookApp.token=''

  nn:
    image: spark-demo
    # The other services reach the NameNode by this hostname (hdfs://nn:9000).
    hostname: nn
    # Formats the NameNode on every start (-force), then runs it.
    command: sh -c "hdfs namenode -format -force && hdfs namenode -D dfs.replication=1 -fs hdfs://nn:9000"

  dn:
    image: spark-demo
    command: hdfs datanode -fs hdfs://nn:9000

  spark-boss:
    image: spark-demo
    # Workers connect to the master at spark://boss:7077.
    hostname: boss
    # `sleep infinity` keeps the container running after the start script returns.
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-master.sh && sleep infinity"

  spark-worker:
    image: spark-demo
    # Each worker offers 2 cores (-c 2) and 2 GB of memory (-m 2g).
    command: sh -c "/spark-3.5.5-bin-hadoop3/sbin/start-worker.sh spark://boss:7077 -c 2 -m 2g && sleep infinity"
    deploy:
      replicas: 2
This diff is collapsed.