Skip to content

Commit

Permalink
Rename script to descarga_opendata.py and update timeouts; also some …
Browse files Browse the repository at this point in the history
…minor changes to add a column with the origin of the metadata
  • Loading branch information
Carlos González Gamella committed Nov 13, 2024
1 parent 4d37862 commit 68c5e46
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -233,13 +233,13 @@ def main():

### Ejemplos de Uso
#Descargar contratos menores de Zaragoza y guardarlos en un archivo JSON:
#python3 script.py zaragoza --ruta_archivo /path/to/contratos_menores_zaragoza.json
#python3 descarga_opendata.py zaragoza --ruta_archivo /path/to/contratos_menores_zaragoza.json

# Descargar contratos menores de Zaragoza
#python3 script.py zaragoza --file_path /path/to/contratos_menores_zaragoza.json
#python3 descarga_opendata.py zaragoza --file_path /path/to/contratos_menores_zaragoza.json
# Descargar contratos menores de Madrid desde 2018
#python3 script.py madrid --start_year 2018 --file_path /path/to/contratos_menores_madrid
#python3 descarga_opendata.py madrid --start_year 2018 --file_path /path/to/contratos_menores_madrid
# Descargar datos de contratación pública de Cataluña
#python3 script.py gencat --file_path /path/to/contratacion_publica_catalunya.csv
#python3 descarga_opendata.py gencat --file_path /path/to/contratacion_publica_catalunya.csv
# Descargar todos los conjuntos de datos
#python3 script.py all --file_path /path/to/descargas
#python3 descarga_opendata.py all --file_path /path/to/descargas
17 changes: 9 additions & 8 deletions integracion_opendata.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def process_zaragoza(df, df_minors):
# Renombrar las columnas
df_minors_zgz = df_filtered.rename(columns=mapeo_zgz)
print(f"Hay un total de {len(df_minors_zgz)} menores de ZARAGOZA")
df_minors_zgz['id'] = 'zaragoza_opendata'
df_minors_zgz['origen'] = 'zaragoza_opendata'

df_combined_minors_zgz = pd.concat([df_minors, df_minors_zgz], ignore_index=True)

Expand Down Expand Up @@ -611,9 +611,9 @@ def process_madrid(df_minors_base, input_dir):
df_bloque2.drop(columns=['CONTRATO','SECCIÓN','F_INSCRIPCION'], inplace=True, errors='ignore')
df_bloque3.drop(columns=['NºRECON','SECCION ', 'FECHA APROBACION', 'FCH.COMUNIC.REG'], inplace=True, errors='ignore')

df_bloque1['id'] = 'madrid_opendata'
df_bloque2['id'] = 'madrid_opendata'
df_bloque3['id'] = 'madrid_opendata'
df_bloque1['origen'] = 'madrid_opendata'
df_bloque2['origen'] = 'madrid_opendata'
df_bloque3['origen'] = 'madrid_opendata'

# Concatenar todos los DataFrames
dataframes = [df_bloque1, df_bloque2, df_bloque3]
Expand Down Expand Up @@ -1190,15 +1190,16 @@ def replace_procedure_code(value):
df_menores_ren['ContractFolderStatus.TenderingProcess.UrgencyCode'] = df_menores_ren['ContractFolderStatus.TenderingProcess.UrgencyCode'].apply(replace_procedure_code)
#import pdb; pdb.set_trace()

df_menores_ren['id'] = 'minors_gencat_opendata'
df_menores_ren['id'] = df_menores_ren['id'].apply(convert_to_object_array)
df_menores_ren['origen'] = 'minors_gencat_opendata'

logging.info(f"Las cols de place de contratos menores son: {df_minors_base.columns.tolist()}")
# Combinar con df_minors_base
# Aquí está vacío df_minors_base, pero debería ser el df de minors
df_minors_combined = pd.concat([df_minors_base, df_menores_ren], ignore_index=True)
logging.info(f"Total de contratos menores después de combinar: {df_minors_combined.shape[0]} filas.")

df_menores_ren['origen'] = df_menores_ren['origen'].apply(convert_to_object_array)

# Unificar tipos de datos para guardar como parquet
df_minors_combined['ContractFolderStatus.LocatedContractingParty.Party.PartyIdentification.ID'] = \
df_minors_combined['ContractFolderStatus.LocatedContractingParty.Party.PartyIdentification.ID'].apply(convert_to_object_array)
Expand Down Expand Up @@ -1397,9 +1398,9 @@ def replace_procedure_code(value):
df_outsiders_all['ContractFolderStatus.TenderingTerms.FundingProgramCode']= \
df_outsiders_all['ContractFolderStatus.TenderingTerms.FundingProgramCode'].apply(convert_to_object_array)

df_outsiders_all['id'] = 'outsiders_gencat_opendata'
df_outsiders_all['origen'] = 'outsiders_gencat_opendata'

df_outsiders_all['id'] = df_outsiders_all['id'].apply(convert_to_object_array)
df_outsiders_all['origen'] = df_outsiders_all['origen'].apply(convert_to_object_array)

#import pdb; pdb.set_trace()
df_outsiders_all.drop(columns=cols_drop, inplace=True, errors='ignore')
Expand Down
4 changes: 2 additions & 2 deletions sproc/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def file(
url: str, # URL for the file to be downloaded
output_file: str | pathlib.Path | None, # Name of the local file to be saved; if `None` its content is returned
timeout: float = 2. # How long to wait for a response
timeout: float = 5. # How long to wait for a response
) -> None | bytes: # Content of the file or `None` if `output_file` was passed
"Downloads a file"

Expand Down Expand Up @@ -57,7 +57,7 @@ def file(
# %% ../nbs/80_download.ipynb 13
def yaml_to_dict(
url: str, # URL for the file to be downloaded
timeout: float = 2. # How long to wait for a response
timeout: float = 5. # How long to wait for a response
) -> dict: # YAML data
"Read YAML data from an URL"

Expand Down

0 comments on commit 68c5e46

Please sign in to comment.