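"""Run the full ETL pipeline: build the base OPA properties dataset, apply
each data_utils service in order, dedupe on `opa_id`, and export the result
to Parquet."""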
import os
import sys

# Ensure the directory containing awkde is on the Python path before the
# new_etl modules are imported, in case any of them import awkde at load time.
awkde_path = "/usr/src/app"
if awkde_path not in sys.path:
    sys.path.append(awkde_path)

import pandas as pd

from new_etl.data_utils.access_process import access_process
from new_etl.data_utils.contig_neighbors import contig_neighbors
from new_etl.data_utils.dev_probability import dev_probability
from new_etl.data_utils.negligent_devs import negligent_devs
from new_etl.data_utils.opa_properties import opa_properties
from new_etl.data_utils.priority_level import priority_level
from new_etl.data_utils.vacant_properties import vacant_properties
from new_etl.data_utils.pwd_parcels import pwd_parcels
from new_etl.data_utils.city_owned_properties import city_owned_properties
from new_etl.data_utils.phs_properties import phs_properties
from new_etl.data_utils.li_violations import li_violations
from new_etl.data_utils.li_complaints import li_complaints
from new_etl.data_utils.rco_geoms import rco_geoms
from new_etl.data_utils.council_dists import council_dists
from new_etl.data_utils.tree_canopy import tree_canopy
from new_etl.data_utils.nbhoods import nbhoods
from new_etl.data_utils.gun_crimes import gun_crimes
from new_etl.data_utils.drug_crimes import drug_crimes
from new_etl.data_utils.delinquencies import delinquencies
from new_etl.data_utils.unsafe_buildings import unsafe_buildings
from new_etl.data_utils.imm_dang_buildings import imm_dang_buildings
from new_etl.data_utils.tactical_urbanism import tactical_urbanism
from new_etl.data_utils.conservatorship import conservatorship
from new_etl.data_utils.owner_type import owner_type
from new_etl.data_utils.community_gardens import community_gardens
from new_etl.data_utils.park_priority import park_priority

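# Each service takes the dataset and returns it with new columns added;
# order matters where a later service depends on a column added earlier.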
services = [
    # vacant designation
    vacant_properties,  # needs to run early so that other utils can make use of the `vacant` designation
    # geometries/areas
    pwd_parcels,
    council_dists,
    nbhoods,
    rco_geoms,
    # ownership
    city_owned_properties,
    phs_properties,
    community_gardens,
    ppr_properties,
    owner_type,
    # quality of life
    li_violations,
    li_complaints,
    tree_canopy,
    gun_crimes,
    drug_crimes,
    delinquencies,
    unsafe_buildings,
    imm_dang_buildings,
    # development
    contig_neighbors,
    dev_probability,
    negligent_devs,
    # access/interventions
    tactical_urbanism,
    conservatorship,
    park_priority,
]

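# Build the base dataset from OPA properties; every service below enriches it.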
dataset = opa_properties()

print("Initial Dataset:")
print("Shape:", dataset.gdf.shape)
print("Head:\n", dataset.gdf.head())
print("NA Counts:\n", dataset.gdf.isna().sum())

for service in services:
    dataset = service(dataset)
    print(f"After {service.__name__}:")
    print("Dataset type:", type(dataset.gdf).__name__)
    print("Shape:", dataset.gdf.shape)
    print("Head:\n", dataset.gdf.head())
    print("NA Counts:\n", dataset.gdf.isna().sum())

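# Services can introduce duplicate rows (e.g. via one-to-many joins), so
# deduplicate on the OPA identifier before the remaining steps.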
before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(
    f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}"
)

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")

# Coerce columns that may arrive as strings to numeric; unparseable values become NaN
numeric_columns = [
    "market_value",
    "sale_price",
    "total_assessment",
    "total_due",
    "num_years_owed",
    "permit_count",
]
for col in numeric_columns:
    dataset.gdf[col] = pd.to_numeric(dataset.gdf[col], errors="coerce")

# most_recent_year_owed stays a string rather than being coerced to numeric
dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str)

print("Column data types before exporting to Parquet:")
print(dataset.gdf.dtypes)

# Quick dataset profiling
print("\nQuick dataset profile:")

# 1) Number of NA values per column
print("\nNumber of NA values per column:")
print(dataset.gdf.isna().sum())

# 2) Mean, median, and std of numeric columns
print("\nMean, Median, and Standard Deviation of numeric columns:")
numeric_columns = dataset.gdf.select_dtypes(include=["float", "int"]).columns

for column in numeric_columns:
    mean = dataset.gdf[column].mean()
    median = dataset.gdf[column].median()
    std = dataset.gdf[column].std()
    print(f"{column}:\n Mean: {mean:.2f}\n Median: {median:.2f}\n Std: {std:.2f}")

# 3) Number of unique values in string columns
print("\nNumber of unique values in string columns:")
string_columns = dataset.gdf.select_dtypes(include=["object", "string"]).columns
unique_values = dataset.gdf[string_columns].nunique()
print(unique_values)

# Make sure the output directory exists before writing the final Parquet file
os.makedirs("tmp", exist_ok=True)
dataset.gdf.to_parquet("tmp/test_output.parquet")