s_1603 #1242

Open · wants to merge 3 commits into master

2 changes: 1 addition & 1 deletion scripts/us_census/acs5yr/subject_tables/S1603/README.md
@@ -2,7 +2,7 @@

This import gives the count of people based on the language spoken at home. It breaks this down further by age, nativity and citizenship status, poverty status, and educational attainment.

Years: 2010-2019
Years: 2010-2023
Geo : Country, State, County and Place

Important Notes :
(file name not shown)
@@ -301,4 +301,5 @@
"Percent speak only English at home MOE",
"PERCENT IMPUTED"
]
}
}

75 changes: 73 additions & 2 deletions scripts/us_census/acs5yr/subject_tables/S1603/process.py
@@ -3,6 +3,8 @@
import sys
import pandas as pd
import numpy as np
from absl import app, flags
import json

# pylint: disable=missing-module-docstring
# pylint: disable=missing-class-docstring
@@ -11,12 +13,37 @@
#pylint: disable=wrong-import-position
#pylint: disable=import-error
_CODEDIR = os.path.dirname(os.path.realpath(__file__))
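# The shared subject-table helpers imported below (data_loader, resolve_geo_id,
# generate_col_map) live in the sibling 'common' directory, so add it to sys.path.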
sys.path.insert(1, os.path.join(_CODEDIR, '..', 'common'))
sys.path.insert(1, os.path.join(_CODEDIR, '../', 'common'))

from data_loader import SubjectTableDataLoaderBase # commit hash - aee443ee
from resolve_geo_id import convert_to_place_dcid
from generate_col_map import process_zip_file
#pylint: enable=wrong-import-position
#pylint: enable=import-error
FLAGS = flags.FLAGS
flags.DEFINE_string(
    'option', 'all',
    'Specify how to run the process: colmap -- generates the column map, '
    'process -- runs processing, all -- runs colmap first and then processing')
flags.DEFINE_string(
    'table_prefix', None,
    '[for processing] Subject Table ID used as a prefix for output files, eg: S2702')
flags.DEFINE_string('spec_path', None, 'Path to the JSON spec [mandatory]')
flags.DEFINE_string(
    'input_path', None,
    'Path to the input zip file (currently only zip files are supported)')
flags.DEFINE_string('output_dir', './', 'Path to the output directory')
flags.DEFINE_boolean(
    'has_percent', False,
    '[for processing] Set if the dataset has percentage values that need to be '
    'converted to counts')
flags.DEFINE_boolean(
    'debug', False,
    '[for processing] Set the flag to add additional debug columns to the output')
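
# Example invocation (illustrative; the spec and input paths are placeholders):
#   python3 process.py --option=all --table_prefix=S1603 --has_percent \
#     --spec_path=<path to JSON spec> --input_path=<path to ACS S1603 zip> \
#     --output_dir=./output/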


class S1603SubjectTableDataLoader(SubjectTableDataLoaderBase):
@@ -57,7 +84,11 @@ def _process_dataframe(self, df, filename):
}

csv_file = open(self.clean_csv_path, 'a')
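# Resolve each row's geography identifier to a Data Commons place dcid; some ACS
# exports name this column 'Geography' while others use 'id', so handle both.
# (Illustrative assumption: an identifier like '0500000US06085' would map to a
# dcid such as 'geoId/06085'.)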
place_geoIds = df['id'].apply(convert_to_place_dcid)
column_headers = list(df.columns.values)
if 'Geography' in column_headers:
place_geoIds = df['Geography'].apply(convert_to_place_dcid)
else:
place_geoIds = df['id'].apply(convert_to_place_dcid)

# update the clean csv
for column in df.columns.tolist():
@@ -126,3 +157,43 @@ def _process_dataframe(self, df, filename):
print(
f" Completed with { self.counter_dict[year]['number of observations'] } observation for { self.counter_dict[year]['number of unique StatVars with observations'] } StatVars at { self.counter_dict[year]['number of unique geos'] } places. ",
flush=True)


def set_column_map(input_path, spec_path, output_dir):
    """Generates the column map from the input zip and the JSON spec and writes
    it to column_map.json in the output directory."""
    generated_col_map = process_zip_file(input_path,
                                         spec_path,
                                         write_output=False)
    with open(os.path.join(output_dir, 'column_map.json'), 'w') as f:
        json.dump(generated_col_map, f, indent=4)


def main(argv):
    option = FLAGS.option.lower()
    table_prefix = FLAGS.table_prefix
    spec_path = FLAGS.spec_path
    input_path = FLAGS.input_path
    output_dir = FLAGS.output_dir
    has_percent = FLAGS.has_percent
    debug = FLAGS.debug

    # Stage 1: generate the column map from the input zip and the spec.
    if option in ('colmap', 'all'):
        set_column_map(input_path, spec_path, output_dir)

    # Stage 2: process the input zip and write the cleaned output.
    if option in ('process', 'all'):
        data_loader = S1603SubjectTableDataLoader(
            table_id=table_prefix,
            col_delimiter='!!',
            has_percent=has_percent,
            debug=debug,
            output_path_dir=output_dir,
            json_spec=spec_path,
            column_map_path=os.path.join(output_dir, 'column_map.json'),
            decimal_places=3,
            estimate_period=5,
            header_row=1)
        data_loader._process_zip_file(input_path)


if __name__ == '__main__':
    flags.mark_flags_as_required(
        ['table_prefix', 'spec_path', 'input_path', 'output_dir'])
    app.run(main)
2 binary files not shown (file names not shown)