Commit 39af6af

Merge pull request #1023 from CodeForPhilly/staging
Weekly PR from Staging to Main

2 parents: a7aa7b7 + 02c7dad


45 files changed: 2527 additions, 1049 deletions

data/docker-compose.yml

Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@ services:
     image: vacant-lots-proj:latest
     environment:
       - GOOGLE_APPLICATION_CREDENTIALS=/app/service-account-key.json
-      - CFP_MAPBOX_TOKEN_UPLOADER
       - VACANT_LOTS_DB
       - CLEAN_GREEN_GOOGLE_KEY
       - PYTHONUNBUFFERED=1
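Of the entries that remain, the bare names (e.g. - VACANT_LOTS_DB) are Compose pass-throughs that inherit their values from the host shell, while GOOGLE_APPLICATION_CREDENTIALS pins the standard path Google client libraries use for credential discovery. A minimal sketch of how code inside the container could pick that up (the google-cloud-storage package is an assumption, this diff does not show it; the bucket name is a placeholder):

import os

from google.cloud import storage  # assumption: google-cloud-storage is installed

# Google client libraries read GOOGLE_APPLICATION_CREDENTIALS on their own,
# so no explicit key-file plumbing is needed inside the container.
print(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"))  # /app/service-account-key.json

client = storage.Client()          # discovers /app/service-account-key.json itself
bucket = client.bucket("example")  # placeholder bucket name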

data/src/config/config.py

Lines changed: 0 additions & 3 deletions

@@ -8,9 +8,6 @@
 USE_CRS = "EPSG:2272"
 """ the standard geospatial code for Pennsylvania South (ftUS) """

-MAPBOX_TOKEN = os.environ.get("CFP_MAPBOX_TOKEN_UPLOADER")
-""" The location of the token for your mapbox account in your environment """
-
 log_level: int = logging.WARN
 """ overall log level for the project """
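Together with the compose change above, this retires the Mapbox uploader token end to end: the container stops receiving CFP_MAPBOX_TOKEN_UPLOADER and config.py stops reading it. The surviving settings are plain module-level constants; a minimal sketch of how a consumer might use them (the reprojection example is illustrative, and it assumes data/src is on sys.path so config.config imports):

import logging

import geopandas as gpd
from shapely.geometry import Point

from config.config import USE_CRS, log_level  # assumes data/src is on sys.path

# Apply the project-wide log level.
logging.basicConfig(level=log_level)

# Illustrative: build a one-point frame in WGS84 and reproject it into the
# project CRS, Pennsylvania South (ftUS).
gdf = gpd.GeoDataFrame(geometry=[Point(-75.1652, 39.9526)], crs="EPSG:4326")
print(gdf.to_crs(USE_CRS).crs)  # EPSG:2272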

data/src/main.py

Lines changed: 155 additions & 0 deletions

@@ -0,0 +1,155 @@

import sys

from new_etl.data_utils.access_process import access_process
from new_etl.data_utils.contig_neighbors import contig_neighbors
from new_etl.data_utils.dev_probability import dev_probability
from new_etl.data_utils.negligent_devs import negligent_devs
from new_etl.data_utils.opa_properties import opa_properties
from new_etl.data_utils.priority_level import priority_level
from new_etl.data_utils.vacant_properties import vacant_properties
from new_etl.data_utils.pwd_parcels import pwd_parcels
from new_etl.data_utils.city_owned_properties import city_owned_properties
from new_etl.data_utils.phs_properties import phs_properties
from new_etl.data_utils.li_violations import li_violations
from new_etl.data_utils.li_complaints import li_complaints
from new_etl.data_utils.rco_geoms import rco_geoms
from new_etl.data_utils.council_dists import council_dists
from new_etl.data_utils.tree_canopy import tree_canopy
from new_etl.data_utils.nbhoods import nbhoods
from new_etl.data_utils.gun_crimes import gun_crimes
from new_etl.data_utils.drug_crimes import drug_crimes
from new_etl.data_utils.delinquencies import delinquencies
from new_etl.data_utils.unsafe_buildings import unsafe_buildings
from new_etl.data_utils.imm_dang_buildings import imm_dang_buildings
from new_etl.data_utils.tactical_urbanism import tactical_urbanism
from new_etl.data_utils.conservatorship import conservatorship
from new_etl.data_utils.owner_type import owner_type
from new_etl.data_utils.community_gardens import community_gardens
from new_etl.data_utils.park_priority import park_priority
from new_etl.data_utils.ppr_properties import ppr_properties

import pandas as pd


# Ensure the directory containing awkde is in the Python path
awkde_path = "/usr/src/app"
if awkde_path not in sys.path:
    sys.path.append(awkde_path)

services = [
    # vacant designation
    vacant_properties,  # needs to run early so that other utils can make use of the `vacant` designation
    # geometries/areas
    pwd_parcels,
    council_dists,
    nbhoods,
    rco_geoms,
    # ownership
    city_owned_properties,
    phs_properties,
    community_gardens,
    ppr_properties,
    owner_type,
    # quality of life
    li_violations,
    li_complaints,
    tree_canopy,
    gun_crimes,
    drug_crimes,
    delinquencies,
    unsafe_buildings,
    imm_dang_buildings,
    # development
    contig_neighbors,
    dev_probability,
    negligent_devs,
    # access/interventions
    tactical_urbanism,
    conservatorship,
    park_priority,
]

dataset = opa_properties()

print("Initial Dataset:")
print("Shape:", dataset.gdf.shape)
print("Head:\n", dataset.gdf.head())
print("NA Counts:\n", dataset.gdf.isna().sum())

for service in services:
    dataset = service(dataset)
    print(f"After {service.__name__}:")
    print("Dataset type:", type(dataset.gdf).__name__)
    print("Shape:", dataset.gdf.shape)
    print("Head:\n", dataset.gdf.head())
    print("NA Counts:\n", dataset.gdf.isna().sum())

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(
    f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}"
)

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")

# Convert problematic columns to numeric
numeric_columns = [
    "market_value",
    "sale_price",
    "total_assessment",
    "total_due",
    "num_years_owed",
    "permit_count",
]
for col in numeric_columns:
    dataset.gdf[col] = pd.to_numeric(dataset.gdf[col], errors="coerce")

dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str)

print("Column data types before exporting to Parquet:")
print(dataset.gdf.dtypes)

# Quick dataset profiling
print("\nQuick dataset profile:")

# 1) Number of NA values per column
print("\nNumber of NA values per column:")
print(dataset.gdf.isna().sum())

# 2) Mean, median, and std of numeric columns
print("\nMean, Median, and Standard Deviation of numeric columns:")
numeric_columns = dataset.gdf.select_dtypes(include=["float", "int"]).columns

for column in numeric_columns:
    mean = dataset.gdf[column].mean()
    median = dataset.gdf[column].median()
    std = dataset.gdf[column].std()
    print(f"{column}:\n Mean: {mean:.2f}\n Median: {median:.2f}\n Std: {std:.2f}")

# 3) Number of unique values in string columns
print("\nNumber of unique values in string columns:")
string_columns = dataset.gdf.select_dtypes(include=["object", "string"]).columns
unique_values = dataset.gdf[string_columns].nunique()
print(unique_values)

dataset.gdf.to_parquet("tmp/test_output.parquet")
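Each entry in `services` is a callable that takes the dataset object and returns it with columns added, which is what lets main.py fold all of them in a single loop. A minimal sketch of that contract (the `Dataset` wrapper and `example_service` below are illustrative stand-ins, not code from this repo; the real utils live in new_etl.data_utils):

import geopandas as gpd
from shapely.geometry import Point

class Dataset:
    """Illustrative stand-in for the pipeline's dataset wrapper: main.py only
    assumes the object carries a GeoDataFrame on a .gdf attribute."""

    def __init__(self, gdf: gpd.GeoDataFrame):
        self.gdf = gdf

def example_service(dataset: Dataset) -> Dataset:
    """Hypothetical service obeying the pipeline contract: read dataset.gdf,
    derive a column, and return the same dataset object."""
    dataset.gdf["parcel_area"] = dataset.gdf.geometry.area
    return dataset

# With that contract, each step of the loop in main.py reduces to:
#     dataset = service(dataset)
gdf = gpd.GeoDataFrame(geometry=[Point(0, 0).buffer(10)], crs="EPSG:2272")
dataset = example_service(Dataset(gdf))
print(dataset.gdf["parcel_area"])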

data/src/new_etl/__init__.py

Whitespace-only changes.

data/src/new_etl/classes/__init__.py

Whitespace-only changes.
