From 0178ac22cd7f2324039b91d688761c61fa19cf86 Mon Sep 17 00:00:00 2001 From: hangy Date: Sun, 22 Dec 2024 15:44:33 +0100 Subject: [PATCH] fix: Correct indentation, so that CodeQL can work with the code (#11166) style: Reformat all Python code with PEP8 for normalization CodeQL reported an error with a Python file because of a compilation error: https://github.com/openfoodfacts/openfoodfacts-server/security/code-scanning/tools/CodeQL/status/configurations/actions-FZTWS5DIOVRC653POJVWM3DPO5ZS6Y3PMRSXC3BNMFXGC3DZONUXGLTZNVWA/81b016e0e16ad77829222f64441894263139370096f7c96cdedadd2642eb9d37 --- .../get_packager_code_from_html_ireland.py | 110 ++++--- scripts/generate_dump_for_offline_apps.py | 18 +- scripts/mappingGES.py | 96 +++--- .../at_packagers_refresh_part1.py | 43 ++- .../at_packagers_refresh_part2.py | 18 +- .../packager-codes/cy_packagers_refresh.py | 81 +++-- .../cz_packagers_refresh_part1.py | 18 +- .../cz_packagers_refresh_part2.py | 33 +- .../dk_packagers_refresh_part1.py | 8 +- .../dk_packagers_refresh_part2.py | 25 +- .../hu_packagers_refresh_part1.py | 32 +- .../hu_packagers_refresh_part2.py | 27 +- .../ie_packagers_refresh_part1.py | 61 ++-- .../ie_packagers_refresh_part2.py | 83 ++--- .../packager-codes/it_packagers_refresh.py | 33 +- .../lu_packagers_refresh_part1.py | 17 +- .../lu_packagers_refresh_part2.py | 27 +- .../packager-codes/non-eu/non_eu_spider.py | 15 +- .../packager-codes/non-eu/packager_codes.py | 12 +- .../packager-codes/non-eu/pdf_extraction.py | 6 +- .../packager-codes/poland_packager_code.py | 4 +- .../portugal-concatenate-csv-sections.py | 71 ++-- .../packager-codes/si-packagers-refresh.py | 306 +++++++++--------- scripts/run_ocr.py | 12 +- scripts/snippets/agribalyse_categories_csv.py | 15 +- scripts/update_tags_per_languages.py | 147 +++++---- scripts/update_tags_per_languages_tests.py | 54 ++-- .../keep_most_recent_row_for_each_product.py | 7 +- 28 files changed, 778 insertions(+), 601 deletions(-) diff --git a/packager-codes/get_packager_code_from_html_ireland.py b/packager-codes/get_packager_code_from_html_ireland.py index 52e98a41ad61d..18b9829ddcadf 100644 --- a/packager-codes/get_packager_code_from_html_ireland.py +++ b/packager-codes/get_packager_code_from_html_ireland.py @@ -8,114 +8,118 @@ # In[]: +import pandas as pd urls = ['https://oapi.fsai.ie/LAApprovedEstablishments.aspx', 'https://oapi.fsai.ie/AuthReg99901Establishments.aspx', 'https://oapi.fsai.ie/HSEApprovedEstablishments.aspx' - ] + ] urls_second_format = ['https://www.sfpa.ie/Seafood-Safety/Registration-Approval-of-Businesses/List-of-Approved-Establishments-and-Vessels/Approved-Establishments', 'https://www.sfpa.ie/Seafood-Safety/Registration-Approval-of-Businesses/Approved-Freezer-Vessels' - ] + ] csv_file = 'Ireland_concatenated.csv' -import pandas as pd pages = [pd.read_html(url) for url in urls] -pages2= [pd.read_html(url) for url in urls_second_format] +pages2 = [pd.read_html(url) for url in urls_second_format] # In[]: -def ireland_correction_of_1_dataframe(df): #Version to get anything - #print ("df as recuperated :") - #print(df.head()) +def ireland_correction_of_1_dataframe(df): # Version to get anything + # print ("df as recuperated :") + # print(df.head()) df.columns = df.iloc[[0]].values.tolist() df = df.rename(columns={' Address': 'Address'}) - df=df.drop(df.index[0]) # + df = df.drop(df.index[0]) row_reference = df.iloc[0] if 'Approval_Number' not in df.columns: print("this table has no approval number and was not added") return pd.DataFrame() - df_is_null=df.isnull() - 
for i in range(1,len(df)): #len(df) - if df_is_null.iloc[i,len(df.columns)-1]: #We assume that on a row, there is no merged cell(null in pandas) on the webpage after an unmerged cell (not null) - row_retrieved=[] + df_is_null = df.isnull() + for i in range(1, len(df)): # len(df) + # We assume that on a row, there is no merged cell(null in pandas) on the webpage after an unmerged cell (not null) + if df_is_null.iloc[i, len(df.columns)-1]: + row_retrieved = [] value = "" - j=0 - while not df_is_null.iloc[i,j]: - value=df.iloc[i,j] + j = 0 + while not df_is_null.iloc[i, j]: + value = df.iloc[i, j] row_retrieved.append(value) - #print("while loop - j:"+str(j)+ "value : "+str(value)) - j+=1 + # print("while loop - j:"+str(j)+ "value : "+str(value)) + j += 1 row = row_reference.copy() - row[len(row)-len(row_retrieved):len(row)]=row_retrieved - df.iloc[i]= row - - row_reference =df.iloc[i] + row[len(row)-len(row_retrieved):len(row)] = row_retrieved + df.iloc[i] = row + row_reference = df.iloc[i] - df["Address"]=df["Address"].apply(add_space_before_uppercase) + df["Address"] = df["Address"].apply(add_space_before_uppercase) - #print ("result corrected : ") - #print(df.head()) + # print ("result corrected : ") + # print(df.head()) return df -#df=pages[0][18] -#ireland_correction_of_1_dataframe(df) +# df=pages[0][18] +# ireland_correction_of_1_dataframe(df) # In[]: def add_space_before_uppercase(words): - result="" - for s in words: - if isinstance(s, str): - if s.isupper(): - result+=" " - result+=s - return result + result = "" + for s in words: + if isinstance(s, str): + if s.isupper(): + result += " " + result += s + return result + + """ This could have been done more efficienty using Regex r"[a-z][A-Z]"" and avoid r" [A-Z]". But google maps recognize it this way.""" # In[ ]: -df=pd.DataFrame() +df = pd.DataFrame() # In[]: -i=0 +i = 0 for page in pages: - j=0 + j = 0 for table in page: - df=df.append(ireland_correction_of_1_dataframe(table), ignore_index=True) - #print ("table "+str(j)+" is ok") - #j+=1 - print ("page "+str(i)+" is done") - i+=1 + df = df.append(ireland_correction_of_1_dataframe( + table), ignore_index=True) + # print ("table "+str(j)+" is ok") + # j+=1 + print("page "+str(i)+" is done") + i += 1 print("finished for all in urls!") # In[]: -i=0 +i = 0 for page2 in pages2: - j=0 + j = 0 for table in page2: - #print (table.head(3)) - table=table.drop(table.index[0]) - table.loc[0,0]='Approval_Number' - #print (ireland_correction_of_1_dataframe(table).head()) - df=df.append(ireland_correction_of_1_dataframe(table), ignore_index=True) - print ("table "+str(j)+" is ok") - j+=1 - print ("page "+str(i)+" is done") - i+=1 + # print (table.head(3)) + table = table.drop(table.index[0]) + table.loc[0, 0] = 'Approval_Number' + # print (ireland_correction_of_1_dataframe(table).head()) + df = df.append(ireland_correction_of_1_dataframe( + table), ignore_index=True) + print("table "+str(j)+" is ok") + j += 1 + print("page "+str(i)+" is done") + i += 1 print("finished for table in urls_second_format!") # In[]: -df.to_csv(csv_file, index = False) +df.to_csv(csv_file, index=False) diff --git a/scripts/generate_dump_for_offline_apps.py b/scripts/generate_dump_for_offline_apps.py index 426b13caefba0..f2517244481ec 100644 --- a/scripts/generate_dump_for_offline_apps.py +++ b/scripts/generate_dump_for_offline_apps.py @@ -3,6 +3,7 @@ import os import pandas + def main(): if not (os.getenv('OFF_PUBLIC_DATA_DIR') and os.getenv('PRODUCT_OPENER_FLAVOR') and os.getenv('PRODUCT_OPENER_FLAVOR_SHORT')): 
print("Environment variables OFF_PUBLIC_DATA_DIR, PRODUCT_OPENER_FLAVOR and PRODUCT_OPENER_FLAVOR_SHORT are required") @@ -13,15 +14,18 @@ def main(): if not os.path.exists(off_public_data_dir + '/offline'): os.makedirs(off_public_data_dir + '/offline') - - df = pandas.read_csv(off_public_data_dir + '/en.' + product_opener_flavor + '.org.products.csv', sep='\t', low_memory=False) - colnames = ['code','product_name','quantity','brands'] + + df = pandas.read_csv(off_public_data_dir + '/en.' + product_opener_flavor + + '.org.products.csv', sep='\t', low_memory=False) + colnames = ['code', 'product_name', 'quantity', 'brands'] # add 'nutriscore_grade','nova_group','environmental_score_grade' columns if the flavor is off if product_opener_flavor_short == 'off': - colnames = colnames + ['nutriscore_grade','nova_group','environmental_score_grade'] + colnames = colnames + ['nutriscore_grade', + 'nova_group', 'environmental_score_grade'] + + df.rename(columns={'nutriscore_grade': 'nutrition_grade_fr'}).to_csv(off_public_data_dir + '/offline/en.' + + product_opener_flavor + '.org.products.small.csv', columns=colnames, sep='\t', index=False) + - df.rename(columns={'nutriscore_grade': 'nutrition_grade_fr'}).to_csv(off_public_data_dir + '/offline/en.' + product_opener_flavor + '.org.products.small.csv', columns = colnames,sep='\t',index=False) - if __name__ == '__main__': main() - diff --git a/scripts/mappingGES.py b/scripts/mappingGES.py index df06f7d25deb3..db58c6d7283eb 100644 --- a/scripts/mappingGES.py +++ b/scripts/mappingGES.py @@ -14,70 +14,76 @@ temporary_exists = os.path.isfile(PATH_TO_TEMPORARY) if temporary_exists: - print "The temporary file already exists" - exit() + print "The temporary file already exists" + exit() ingredients_exists = os.path.isfile(PATH_TO_INGREDIENTS) if not ingredients_exists: - print "The ingredient file does not exist, check the path :" + PATH_TO_INGREDIENTS - exit() + print "The ingredient file does not exist, check the path :" + PATH_TO_INGREDIENTS + exit() foodGES_exists = os.path.isfile(PATH_TO_FOODGES) if not foodGES_exists: - print "The foodGES file does not exist, check the path :" + PATH_TO_FOODGES - exit() + print "The foodGES file does not exist, check the path :" + PATH_TO_FOODGES + exit() + def check_next_lines(ingredients): - next_line_is_not_foodges = True - keep_lines = [] - while next_line_is_not_foodges: - next_line = ingredients.readline() - keep_lines.append(next_line) - if STRING_FOODGES_VALUE not in next_line and STRING_FOODGES_INGREDIENT not in next_line: - next_line_is_not_foodges = False - return keep_lines + next_line_is_not_foodges = True + keep_lines = [] + while next_line_is_not_foodges: + next_line = ingredients.readline() + keep_lines.append(next_line) + if STRING_FOODGES_VALUE not in next_line and STRING_FOODGES_INGREDIENT not in next_line: + next_line_is_not_foodges = False + return keep_lines + def write_next_lines(next_lines, temporary_file): - size = len(next_lines) - for i in range(0, size-1): - line = next_lines[i] - if STRING_FOODGES_INGREDIENT in line: - temporary_file.write(line) - if line.rstrip("\n") not in dict: - print("this mapping is not known : " + line.rstrip("\n")) - else: - temporary_file.write(STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n") - if line.rstrip("\n") in unused_mappings: - unused_mappings.remove(line.rstrip("\n")) - temporary_file.write(next_lines[size-1]) + size = len(next_lines) + for i in range(0, size-1): + line = next_lines[i] + if STRING_FOODGES_INGREDIENT in line: + 
temporary_file.write(line) + if line.rstrip("\n") not in dict: + print("this mapping is not known : " + line.rstrip("\n")) + else: + temporary_file.write( + STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n") + if line.rstrip("\n") in unused_mappings: + unused_mappings.remove(line.rstrip("\n")) + temporary_file.write(next_lines[size-1]) + with open(PATH_TO_FOODGES, 'r') as csvFile: - reader = csv.reader(csvFile) - for row in reader: - dict[row[2]]=row[1] - unused_mappings.append(row[2]) + reader = csv.reader(csvFile) + for row in reader: + dict[row[2]] = row[1] + unused_mappings.append(row[2]) csvFile.close() -temporary_file = open(PATH_TO_TEMPORARY,"w+") +temporary_file = open(PATH_TO_TEMPORARY, "w+") ingredients = file(PATH_TO_INGREDIENTS) while True: - line = ingredients.readline() - temporary_file.write(line) - if not line: break - if STRING_FOODGES_INGREDIENT in line: - if line.rstrip("\n") not in dict: - print("this mapping is not known : " + line.rstrip("\n")) - else: - temporary_file.write(STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n") - if line.rstrip("\n") in unused_mappings: - unused_mappings.remove(line.rstrip("\n")) - next_lines = check_next_lines(ingredients) - write_next_lines(next_lines, temporary_file) + line = ingredients.readline() + temporary_file.write(line) + if not line: + break + if STRING_FOODGES_INGREDIENT in line: + if line.rstrip("\n") not in dict: + print("this mapping is not known : " + line.rstrip("\n")) + else: + temporary_file.write(STRING_FOODGES_VALUE + + dict.get(line.rstrip("\n")) + "\n") + if line.rstrip("\n") in unused_mappings: + unused_mappings.remove(line.rstrip("\n")) + next_lines = check_next_lines(ingredients) + write_next_lines(next_lines, temporary_file) ingredients.close() -temporary_file.close() +temporary_file.close() os.remove(PATH_TO_INGREDIENTS) os.rename(PATH_TO_TEMPORARY, PATH_TO_INGREDIENTS) @@ -85,4 +91,4 @@ def write_next_lines(next_lines, temporary_file): print("\n") print "This is the list of unused mapping : " for mapping in unused_mappings: - print mapping + print mapping diff --git a/scripts/packager-codes/at_packagers_refresh_part1.py b/scripts/packager-codes/at_packagers_refresh_part1.py index dfeeb6d1a1693..0c59e972825a5 100644 --- a/scripts/packager-codes/at_packagers_refresh_part1.py +++ b/scripts/packager-codes/at_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -79,7 +79,7 @@ def split_name_address(input_name_address: str, output: str) -> str: name += lines[0] for line in lines[1:]: line_split = line.split(',') - + if line_split[0][:4].isdigit(): address += ', ' + line_split[0] else: @@ -108,7 +108,8 @@ def read_all_pdf() -> pl.dataframe.frame.DataFrame: pdf_path = os.path.join(pdf_directory, filename) try: # Extract tables from each PDF page - tables = camelot.read_pdf(pdf_path, pages='all', flavor="stream") + tables = camelot.read_pdf( + pdf_path, pages='all', flavor="stream") file_dfs = [] for table in tables: @@ -116,30 +117,37 @@ def read_all_pdf() -> pl.dataframe.frame.DataFrame: # some rows have been split in 2 or 3 # to tackle it, 1) replace by the previous code when code is null - df_replace_null = df.with_columns(pl.all().replace("", None)) - df_fill_code = df_replace_null.with_columns(pl.col("0").fill_null(strategy="forward"),) + df_replace_null = df.with_columns( + pl.all().replace("", None)) + df_fill_code = df_replace_null.with_columns( + pl.col("0").fill_null(strategy="forward"),) # first row of the df are empty, select df without those - df_not_null = df_fill_code.filter(pl.any_horizontal(pl.col("0").is_not_null())) + df_not_null = df_fill_code.filter( + pl.any_horizontal(pl.col("0").is_not_null())) # 2) group by code and concat other columns df_grouped_by_code = df_not_null.group_by('0').agg( **{col: pl.col(col).str.concat(", ") for col in df.columns if col != '0'} ) # ignore rows if first column does not start by "AT " - df = df_grouped_by_code.filter(df_grouped_by_code['0'].str.starts_with("AT ")) + df = df_grouped_by_code.filter( + df_grouped_by_code['0'].str.starts_with("AT ")) - # select col before concat because on some pages two columns + # select col before concat because on some pages two columns # are merged as a single one by the extraction # resulting in different nb of columnes # case column 0 & column 1 are merged (column 0 contains identification number: AT 61898 EG8007004) column_1_suffix_check = df['0'].str.ends_with('EG').all() if not column_1_suffix_check: - updated_col = df['0'].str.split("EG", inclusive=True).list.first() + updated_col = df['0'].str.split( + "EG", inclusive=True).list.first() # other columns are shifted - df = df.with_columns(updated_col.alias('0'), pl.col('2').alias('3'), pl.col('1').alias('2')) + df = df.with_columns(updated_col.alias('0'), pl.col( + '2').alias('3'), pl.col('1').alias('2')) - column_1_suffix_double_check = df['0'].str.ends_with('EG').all() + column_1_suffix_double_check = df['0'].str.ends_with( + 'EG').all() if not column_1_suffix_double_check: print("error parsing first column: ") print(df.head(2)) @@ -147,9 +155,11 @@ def read_all_pdf() -> pl.dataframe.frame.DataFrame: # example: KOPP ANGELIKA UND STEFAN\nEhringstraße 41, [WEINZIERL]\n9412 Wolfsberg,... 
# the column 4 is always empty if df.filter(pl.col('3') != '').is_empty(): - # assume name1\naddres1, name2\naddress2, name3(\naddress3, name4) - df = df.with_columns(pl.col('2').map_elements(lambda x: split_name_address(x, 'address'), return_dtype=str).alias('3')) - df = df.with_columns(pl.col('2').map_elements(lambda x: split_name_address(x, 'name'), return_dtype=str).alias('2')) + # assume name1\naddres1, name2\naddress2, name3(\naddress3, name4) + df = df.with_columns(pl.col('2').map_elements( + lambda x: split_name_address(x, 'address'), return_dtype=str).alias('3')) + df = df.with_columns(pl.col('2').map_elements( + lambda x: split_name_address(x, 'name'), return_dtype=str).alias('2')) df = df.select(['0', '2', '3']) @@ -193,11 +203,9 @@ def clean_name(input_name: str) -> str: if '[' in input_name and ']' in input_name: input_name = input_name.split('[')[0].strip(', ') - input_name = input_name.replace(',,', ',') input_name = input_name.replace(', ,', ',') - return input_name @@ -206,7 +214,8 @@ def clean_name(input_name: str) -> str: new_column_names = ['code', 'name', 'address'] df_renamed = df.rename({i: j for i, j in zip(df.columns, new_column_names)}) -df_clean_name = df_renamed.with_columns(pl.col('name').map_elements(lambda x: clean_name(x), return_dtype=str)) +df_clean_name = df_renamed.with_columns( + pl.col('name').map_elements(lambda x: clean_name(x), return_dtype=str)) # rm duplicates df_deduplicated = df_clean_name.unique() diff --git a/scripts/packager-codes/at_packagers_refresh_part2.py b/scripts/packager-codes/at_packagers_refresh_part2.py index 97a2b9ccce058..d8dee5a620a35 100644 --- a/scripts/packager-codes/at_packagers_refresh_part2.py +++ b/scripts/packager-codes/at_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -35,7 +35,8 @@ def extract_address_components(address_to_convert): print("warning missing address") elif len(address_split) < 2: if address_split[0][:4].isdigit(): - print("warning address without street name (only postcode and town):", address_split[0]) + print( + "warning address without street name (only postcode and town):", address_split[0]) post_and_town = address_split[0] if "," in post_and_town: postal_code = post_and_town.split()[0] @@ -81,7 +82,6 @@ def extract_address_components(address_to_convert): else: print("error to extract postcode and town:", address) - return street, postal_code, town @@ -136,9 +136,10 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if street in url_2: url_3 = url_2.replace(f"street={street}&", "") else: - print(f'Empty response for before url_3: {address_to_convert}: {url_2}') + print(f'Empty response for before url_3: { + address_to_convert}: {url_2}') sys.exit(1) - + try: print("url_3", url_3) response = requests.get(url_3) @@ -146,7 +147,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if data != []: lat, lng = data[0]['lat'], data[0]['lon'] else: - print(f'Empty response for: {address_to_convert}" {url_3}') + print(f'Empty response for: { + address_to_convert}" {url_3}') sys.exit(1) except (requests.exceptions.RequestException, KeyError, IndexError) as e: print(f"Error: {e}, url: {url}") @@ -164,7 +166,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: source_file = 'AT-merge-UTF-8_no_coord.csv' target_file = "AT-merge-UTF-8.csv" index_last_line_processed = 'at_packagers_refresh_part2_index_tmp.txt' -api_key = "" # TODO remove +api_key = "" # TODO remove data = [] @@ -193,7 +195,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/cy_packagers_refresh.py b/scripts/packager-codes/cy_packagers_refresh.py index e78c4ec5468e3..c43ac5b7470fa 100644 --- a/scripts/packager-codes/cy_packagers_refresh.py +++ b/scripts/packager-codes/cy_packagers_refresh.py @@ -1,7 +1,7 @@ -''' +""" This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -55,8 +55,7 @@ deactivate - delete all temporary files - update .sto file -''' - +""" import os import polars as pl @@ -66,20 +65,20 @@ def split_name_address(input_name_address: str, output: str) -> str: name = "" address = "" - lines = input_name_address.split('\n') + lines = input_name_address.split("\n") lines = [x.strip() for x in lines] name += lines[0] for line in lines[1:]: - line_split = line.split(',') - + line_split = line.split(",") + if line_split[0][:4].isdigit(): - address += ', ' + line_split[0] + address += ", " + line_split[0] else: address += line_split[0] name += ", ".join(line_split[1:]) - if output == 'name': + if output == "name": return name else: return address @@ -88,11 +87,13 @@ def split_name_address(input_name_address: str, output: str) -> str: import os import polars as pl + def read_csv_file(filename, pdf_directory) -> pl.dataframe.frame.DataFrame: pdf_path = os.path.join(pdf_directory, filename) df = pl.read_csv(filename, truncate_ragged_lines=True) return df + def process_dataframe(df) -> pl.dataframe.frame.DataFrame: columns_to_drop = get_columns_to_drop(df) df = df.drop(columns_to_drop) @@ -113,53 +114,83 @@ def process_dataframe(df) -> pl.dataframe.frame.DataFrame: return df + def get_columns_to_drop(df) -> list: columns_to_drop = [] for i in range(len(df.columns)): if df[df.columns[i]].null_count() == df[df.columns[i]].len(): columns_to_drop.append(df.columns[i]) - if df[df.columns[i]].dtype == pl.String and (df[df.columns[i]].str.len_chars() == 0).all(): + if ( + df[df.columns[i]].dtype == pl.String + and (df[df.columns[i]].str.len_chars() == 0).all() + ): columns_to_drop.append(df.columns[i]) return columns_to_drop + def remove_header_inside_column(df) -> pl.dataframe.frame.DataFrame: return df.filter(pl.col(df.columns[1]) != df.columns[1]) + def transform_column(df) -> pl.dataframe.frame.DataFrame: - return df.with_columns(pl.col(df.columns[1]).map_elements(lambda x: "CY " + x if not x.startswith('CY') else x, return_dtype=str)) - .with_columns(pl.col(df.columns[1]).map_elements(lambda x: x.replace('CY', 'CY ') if not x.startswith('CY ') else x, return_dtype=str)) + return df.with_columns( + pl.col(df.columns[1]).map_elements( + lambda x: "CY " + x if not x.startswith("CY") else x, + return_dtype=str, + ) + ).with_columns( + pl.col(df.columns[1]).map_elements( + lambda x: x.replace("CY", "CY ") if not x.startswith("CY ") else x, + return_dtype=str, + ) + ) + def handle_split_columns(df) -> pl.dataframe.frame.DataFrame: name_prefix = df[df.columns[1]].str.starts_with("CY").all() name_length = (df[df.columns[1]].str.len_chars() == 3).all() if name_prefix and name_length: - df = df.with_columns((pl.col(df.columns[1]) + " " + pl.col(df.columns[2])).alias(df.columns[1])) + df = df.with_columns( + (pl.col(df.columns[1]) + " " + pl.col(df.columns[2])).alias( + df.columns[1] + ) + ) df = df.drop(df.columns[2]) return df + def remove_new_line_characters(df) -> pl.dataframe.frame.DataFrame: for column in df.columns[1:4]: - df = df.with_columns(pl.col(column).str.replace_all('\n', ' ').str.replace_all('\r', ' ').str.replace_all(' ', ' ')) + df = df.with_columns( + pl.col(column) + .str.replace_all("\n", " ") + .str.replace_all("\r", " ") + .str.replace_all(" ", " ") + ) 
return df + def select_and_rename_columns(df) -> pl.dataframe.frame.DataFrame: df = df.select(df.columns[1:4]) - new_column_names = ['code', 'name', 'address'] + new_column_names = ["code", "name", "address"] df = df.rename({i: j for i, j in zip(df.columns, new_column_names)}) return df + def filter_null_names(df) -> pl.dataframe.frame.DataFrame: - return df.filter(pl.col('name').is_not_null()) + return df.filter(pl.col("name").is_not_null()) + def append_suffix(df) -> pl.dataframe.frame.DataFrame: return df.with_columns((pl.col(df.columns[0]) + " EK").alias(df.columns[0])) -def read_all_csv(pdf_directory='.') -> pl.dataframe.frame.DataFrame: + +def read_all_csv(pdf_directory=".") -> pl.dataframe.frame.DataFrame: dfs = [] for filename in os.listdir(pdf_directory): - if filename.endswith('.csv') and filename != output_file: + if filename.endswith(".csv") and filename != output_file: print(filename) try: df = read_csv_file(filename, pdf_directory) @@ -173,11 +204,17 @@ def read_all_csv(pdf_directory='.') -> pl.dataframe.frame.DataFrame: return result_df -output_file = 'CY-merge-UTF-8.csv' +output_file = "CY-merge-UTF-8.csv" df = read_all_csv() # rm duplicates -df = df.lazy().group_by('code').agg(pl.first('name'), pl.first('address')).sort('code').collect() - -df.write_csv(output_file, separator=';') +df = ( + df.lazy() + .group_by("code") + .agg(pl.first("name"), pl.first("address")) + .sort("code") + .collect() +) + +df.write_csv(output_file, separator=";") diff --git a/scripts/packager-codes/cz_packagers_refresh_part1.py b/scripts/packager-codes/cz_packagers_refresh_part1.py index 39a09c70d0938..ac869de3784be 100644 --- a/scripts/packager-codes/cz_packagers_refresh_part1.py +++ b/scripts/packager-codes/cz_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -65,15 +65,17 @@ def read_input_file() -> pl.dataframe.frame.DataFrame: file_path = os.path.join(current_directory, filename) try: - df = pl.read_csv(filename) + df = pl.read_csv(filename) df = df.select(df.columns[0:3]) - + new_column_names = ['code', 'name', 'address'] - df = df.rename({i: j for i, j in zip(df.columns, new_column_names)}) - + df = df.rename( + {i: j for i, j in zip(df.columns, new_column_names)}) + # append suffix EK at the end of the packaging codes - df = df.with_columns((pl.col(df.columns[0]) + " ES").alias(df.columns[0])) + df = df.with_columns( + (pl.col(df.columns[0]) + " ES").alias(df.columns[0])) dfs.append(df) @@ -87,13 +89,13 @@ def read_input_file() -> pl.dataframe.frame.DataFrame: return result_df - if __name__ == "__main__": output_file = 'CZ-merge-UTF-8_no_coord.csv' df = read_input_file() # rm duplicates - df = df.lazy().group_by('code').agg(pl.first('name'), pl.first('address')).sort('code').collect() + df = df.lazy().group_by('code').agg(pl.first('name'), + pl.first('address')).sort('code').collect() df.write_csv(output_file, separator=';') diff --git a/scripts/packager-codes/cz_packagers_refresh_part2.py b/scripts/packager-codes/cz_packagers_refresh_part2.py index 90a429b79599b..833c1440702db 100644 --- a/scripts/packager-codes/cz_packagers_refresh_part2.py +++ b/scripts/packager-codes/cz_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -37,12 +37,14 @@ def extract_address_components(address_to_convert): # can be postal code and town without comma # example: 281 63 Přehvozdí 62/2 if address_split[0][:5].isdigit(): - print("warning, address without street name (only postcode and town):", address_split) + print( + "warning, address without street name (only postcode and town):", address_split) post_and_town = address_split[0] postal_code = post_and_town.split()[0] town = " ".join(post_and_town.split()[1:]) elif address_split[0][:6].replace(" ", "").isdigit(): - print("warning, address without street name (only postcode and town):", address_split) + print( + "warning, address without street name (only postcode and town):", address_split) post_and_town = address_split[0] postal_code = " ".join(post_and_town.split()[:2]) town = " ".join(post_and_town.split()[2:]) @@ -59,13 +61,14 @@ def extract_address_components(address_to_convert): if w.lower() != w or found_title_word: updated_town.append(w) - # can be none + # can be none # example: parc.č. 164 - zemědělský areál if updated_town: town = updated_town elif len(address_split) == 2: - street, post_and_town = address_split[0].strip(), address_split[1].strip() + street, post_and_town = address_split[0].strip( + ), address_split[1].strip() # sometimes there are no postcode, just town if post_and_town[:5].isdigit(): postal_code = post_and_town.split()[0] @@ -74,7 +77,8 @@ def extract_address_components(address_to_convert): postal_code = post_and_town[:6] town = post_and_town[6:] else: - print("warning: could not extract postal code, set second element as town", address_split) + print( + "warning: could not extract postal code, set second element as town", address_split) town = post_and_town else: @@ -123,7 +127,6 @@ def extract_address_components(address_to_convert): else: print("error, town undefined, lat and lng will be search for the country only.") - print(f"street: {street}, postal_code: {postal_code}, town: {town}") return street, postal_code, town @@ -173,7 +176,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: pattern = r"(? 
hotecká 1538 1538 (previously) # hotecká 1538 1538 -> hotecká 1538 (hereafter) pattern = r"(\d+)\s*\1\b" @@ -197,9 +200,10 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if street in url_2: url_3 = url_2.replace(f"street={street}&", "") else: - print(f'Empty response for before url_3: {address_to_convert}: {url_2}') + print(f'Empty response for before url_3: { + address_to_convert}: {url_2}') sys.exit(1) - + try: print("url_3", url_3) response = requests.get(url_3) @@ -207,7 +211,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if data != []: lat, lng = data[0]['lat'], data[0]['lon'] else: - print(f'Empty response for: {address_to_convert}" {url_3}') + print(f'Empty response for: { + address_to_convert}" {url_3}') sys.exit(1) except (requests.exceptions.RequestException, KeyError, IndexError) as e: print(f"Error: {e}, url: {url}") @@ -222,13 +227,11 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: return [lat, lng] - if __name__ == "__main__": source_file = 'CZ-merge-UTF-8_no_coord.csv' target_file = "CZ-merge-UTF-8.csv" index_last_line_processed = 'cz_packagers_refresh_part2_index_tmp.txt' - api_key = "" # TODO remove - + api_key = "" # TODO remove data = [] try: @@ -256,7 +259,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/dk_packagers_refresh_part1.py b/scripts/packager-codes/dk_packagers_refresh_part1.py index 0b932afd641ab..40e468ee9b9c8 100644 --- a/scripts/packager-codes/dk_packagers_refresh_part1.py +++ b/scripts/packager-codes/dk_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -60,7 +60,8 @@ def read_input_file(file_name: str) -> pd.core.frame.DataFrame: # ignore two first tabs ids = [i for i in range(2, 21)] # skip first few rows (~page header) - excel_file = pd.read_excel('Autoriserede_Foedevarevirksomheder_Excel(1).xlsx', sheet_name=ids, skiprows=5) + excel_file = pd.read_excel( + 'Autoriserede_Foedevarevirksomheder_Excel(1).xlsx', sheet_name=ids, skiprows=5) # take only first three columns (code, name, address) filtered_dfs = [df.iloc[:, :3] for df in excel_file.values()] # combine all tabs into single one @@ -79,7 +80,7 @@ def read_input_file(file_name: str) -> pd.core.frame.DataFrame: # some approval number became float (60.0) df['code'] = df['code'].apply(lambda x: str(x).replace('.0', '')) - # append prefix DF and suffix EK + # append prefix DF and suffix EK # at the end of the packaging codes df['code'] = df['code'].str.strip() df['code'] = df['code'].apply(lambda x: f"DK {x} EF") @@ -90,7 +91,6 @@ def read_input_file(file_name: str) -> pd.core.frame.DataFrame: return df - if __name__ == "__main__": input_file = 'Autoriserede_Foedevarevirksomheder_Excel(1).xlsx' output_file = 'DK-merge-UTF-8_no_coord.csv' diff --git a/scripts/packager-codes/dk_packagers_refresh_part2.py b/scripts/packager-codes/dk_packagers_refresh_part2.py index bc9273f32803d..26c8fb8ed2830 100644 --- a/scripts/packager-codes/dk_packagers_refresh_part2.py +++ b/scripts/packager-codes/dk_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -36,13 +36,15 @@ def extract_address_components(address_to_convert): print("info, address without comma") elif len(address_split) == 2: print("info, exactly 1 comma", address_split) - street, post_and_town = address_split[0].strip(), address_split[1].strip() + street, post_and_town = address_split[0].strip( + ), address_split[1].strip() # sometimes there are no postcode, just town if post_and_town[:4].isdigit(): postal_code = post_and_town.split()[0] town = " ".join(post_and_town.split()[1:]) else: - print("warning: could not extract postal code, set second element as town", address_split) + print( + "warning: could not extract postal code, set second element as town", address_split) town = post_and_town else: print("info, more than 1 comma", address_split) @@ -112,11 +114,10 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if len(town_split[-1]) == 1 or len(town_split[-1]) == 2: town = " ".join(town_split[:-1]) print("info, drop suffix of town") - + if old_town != town: url = url.replace(old_town, town) - url_2 = url.replace(f"street={old_street}&", "") print("info, drop street") @@ -131,8 +132,9 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: # can be in Greenland # example: Fiskervej B 99, Postboks 69, 3921 Narsaq - url_3 = url_2.replace(f"country=Denmark&country_code=DK&", "") - + url_3 = url_2.replace( + f"country=Denmark&country_code=DK&", "") + try: print("url_3", url_3) response = requests.get(url_3) @@ -140,7 +142,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if data != []: lat, lng = data[0]['lat'], data[0]['lon'] else: - print(f'Empty response for: {address_to_convert}" {url_3}') + print(f'Empty response for: { + address_to_convert}" {url_3}') sys.exit(1) except (requests.exceptions.RequestException, KeyError, IndexError) as e: print(f"Error: {e}, url: {url}") @@ -155,16 +158,14 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: return [lat, lng] - if __name__ == "__main__": source_file = 'DK-merge-UTF-8_no_coord.csv' target_file = "DK-merge-UTF-8.csv" index_last_line_processed = 'dk_packagers_refresh_part2_index_tmp.txt' - api_key = "" # TODO remove + api_key = "" # TODO remove country_name = "Denmark" country_code = "DK" - data = [] try: with open(index_last_line_processed, 'r') as f: @@ -190,7 +191,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/hu_packagers_refresh_part1.py b/scripts/packager-codes/hu_packagers_refresh_part1.py index f5f317a25218b..4c6ff49c15a08 100644 --- a/scripts/packager-codes/hu_packagers_refresh_part1.py +++ b/scripts/packager-codes/hu_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -67,13 +67,15 @@ import polars as pl import re + def handle_dates(text: str) -> str: # different variants # 2022.02.03 # 2022.02 # 2022-02-03 # 202202.03 - split_text = re.split(r"\d{4}[\.|\-]*\s*\d{2}[\.|\-](?:\d{2})*", text, maxsplit=1) + split_text = re.split( + r"\d{4}[\.|\-]*\s*\d{2}[\.|\-](?:\d{2})*", text, maxsplit=1) first_text = split_text[0] return first_text @@ -94,7 +96,7 @@ def read_input_file(file_name: str) -> pl.dataframe.frame.DataFrame: df = pl.read_csv(file_name, separator=',', truncate_ragged_lines=True) # parsing issue: - # address is found in next column (3), + # address is found in next column (3), # leaving address column (2) null df = df.with_columns( pl.when(pl.col(df.columns[2]).is_null()) @@ -121,15 +123,16 @@ def read_input_file(file_name: str) -> pl.dataframe.frame.DataFrame: df = df.with_columns((pl.col('code') + " ES").alias(df.columns[0])) # rm duplicates - df = df.lazy().group_by('code').agg(pl.first('name'), pl.first('address')).sort('code').collect() + df = df.lazy().group_by('code').agg(pl.first('name'), + pl.first('address')).sort('code').collect() # add lost record during conversion into csv # HU 13 TCS 003 ES;Magyar Agrár- és Élettudományi Egyetem Kaposvári Campus;7400 Kaposvár, Guba Sándor u. 40. / Somogy df = df.with_columns(pl.when(pl.col('code') == "HU 13 TCS 003 ES") - .then(pl.lit("Magyar Agrár- és Élettudományi Egyetem Kaposvári Campus")) - .otherwise(pl.col('name')) - .alias('name') - ) + .then(pl.lit("Magyar Agrár- és Élettudományi Egyetem Kaposvári Campus")) + .otherwise(pl.col('name')) + .alias('name') + ) # all others missing 'name' are strikethrough text df = df.filter(pl.col('name').is_not_null()) @@ -140,7 +143,8 @@ def read_input_file(file_name: str) -> pl.dataframe.frame.DataFrame: # sometimes dates inside text # last update is text before first date occurence - df = df.with_columns(pl.col('name').map_elements(lambda x: handle_dates(x), return_dtype=str)) + df = df.with_columns(pl.col('name').map_elements( + lambda x: handle_dates(x), return_dtype=str)) df = df.with_columns(pl.col('name').str.strip_chars()) df = df.with_columns(pl.col('name').str.replace_all(' ', ' ')) @@ -159,9 +163,10 @@ def read_input_file(file_name: str) -> pl.dataframe.frame.DataFrame: df = df.filter(pl.col('address').str.len_chars() > 6) # remove "/ " at the end of the address - # but not 10/A. + # but not 10/A. 
# smallest county is "Vas" - df = df.with_columns(pl.col('address').map_elements(lambda x: handle_county(x), return_dtype=str)) + df = df.with_columns(pl.col('address').map_elements( + lambda x: handle_county(x), return_dtype=str)) # remove "(text)" in middle of address # as well as "(begining of text" at end of the line df = df.with_columns( @@ -180,19 +185,18 @@ def read_input_file(file_name: str) -> pl.dataframe.frame.DataFrame: # keep 2 first parts of teach line # postal code city, street, additional information df = df.with_columns( - address=pl.col('address').str.split(",").list.slice(0, 2).list.join(",") + address=pl.col('address').str.split( + ",").list.slice(0, 2).list.join(",") ) return df - if __name__ == "__main__": input_file = 'enged_2024_05_22.csv' code_prefix = 'HU' output_file = f'{code_prefix}-merge-UTF-8_no_coord.csv' - df = read_input_file(input_file) df.write_csv(output_file, separator=';') diff --git a/scripts/packager-codes/hu_packagers_refresh_part2.py b/scripts/packager-codes/hu_packagers_refresh_part2.py index 01af863b99599..819d0a4033bfc 100644 --- a/scripts/packager-codes/hu_packagers_refresh_part2.py +++ b/scripts/packager-codes/hu_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -40,12 +40,11 @@ def extract_address_components(address_to_convert): elif len(address_split) == 2: print("info, exactly 1 comma", address_split) # postcode and town are first - post_and_town, street = address_split[0].strip(), address_split[1].strip() + post_and_town, street = address_split[0].strip( + ), address_split[1].strip() else: print("info, more than 1 comma", address_split) - - if post_and_town: # type 626 Harta -> 6326 Harta if '626 Harta' in post_and_town: @@ -54,7 +53,7 @@ def extract_address_components(address_to_convert): postal_code = post_and_town[:4] town = post_and_town[5:] # sometimes spaces between all characters - # 2 0 2 4 K i s o r o s z i + # 2 0 2 4 K i s o r o s z i elif post_and_town.replace(' ', '')[:4].isdigit(): postal_code = post_and_town.replace(' ', '')[:4] town_no_space = post_and_town.replace(' ', '')[4:] @@ -65,7 +64,8 @@ def extract_address_components(address_to_convert): else: town += ch else: - print("warning, could not extract postal code, set whole element as town", post_and_town) + print( + "warning, could not extract postal code, set whole element as town", post_and_town) town = post_and_town print(f"street: {street}, postal_code: {postal_code}, town: {town}") @@ -113,7 +113,6 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: url_3 = url_2.replace(f"city={town}&", "") print("info, drop town") - try: print("url_3", url_3) response = requests.get(url_3) @@ -125,7 +124,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: # postalcode typo # 8927 Nemessándorháza -> 8925 Nemessándorháza - url_4 = url_3.replace(f"postalcode={postal_code}", f"city={town}") + url_4 = url_3.replace( + f"postalcode={postal_code}", f"city={town}") print("info, replace postalcode by town") try: @@ -135,7 +135,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: if data != []: lat, lng = data[0]['lat'], data[0]['lon'] else: - print(f'Empty response for: {address_to_convert}, {url_4}') + print(f'Empty 
response for: { + address_to_convert}, {url_4}') sys.exit(1) except (requests.exceptions.RequestException, KeyError, IndexError) as e: print(f"Error: {e}, url: {url}") @@ -153,14 +154,14 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: return [lat, lng] - if __name__ == "__main__": country_code = 'HU' country_name = 'Hungary' source_file = f'{country_code}-merge-UTF-8_no_coord.csv' target_file = f'{country_code}-merge-UTF-8.csv' - index_last_line_processed = f'{country_code.lower()}_packagers_refresh_part2_index_tmp.txt' - api_key = "" # TODO remove + index_last_line_processed = f'{ + country_code.lower()}_packagers_refresh_part2_index_tmp.txt' + api_key = "" # TODO remove data = [] try: @@ -187,7 +188,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/ie_packagers_refresh_part1.py b/scripts/packager-codes/ie_packagers_refresh_part1.py index 028114fd9785f..2a6f653774873 100644 --- a/scripts/packager-codes/ie_packagers_refresh_part1.py +++ b/scripts/packager-codes/ie_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -74,16 +74,17 @@ def extract_row_from_online_table(row: list) -> dict: return row_data + def parse_from_website(url: str) -> pl.dataframe.frame.DataFrame: try: html_content = requests.get(url, headers=headers).text - except requests.exceptions.ConnectionError : + except requests.exceptions.ConnectionError: print(f"parse_from_website, cannot get url {url}") if not html_content: print(f"parse_from_website, error with request {url}") sys.exit(1) - + soup = BeautifulSoup(html_content, 'html.parser') tables = soup.find_all('table') @@ -91,7 +92,8 @@ def parse_from_website(url: str) -> pl.dataframe.frame.DataFrame: data_rows = [] for table in tables: for tr in table.find_all('tr'): - raw_row_data = [td.get_text(separator=", ") for td in tr.find_all('td')] + raw_row_data = [td.get_text(separator=", ") + for td in tr.find_all('td')] # ignore [] # ignore ['Ovine'] or ['Cutting', 'Bovine'] (due to merged cells) # keep only first 4 columns @@ -134,7 +136,8 @@ def create_df_dafm_meat() -> pl.dataframe.frame.DataFrame: # keep only where integer in code column df = df.drop_nulls(subset=[df.columns[0]]) # add prefix and suffix - df = df.with_columns((f"{code_prefix} " + pl.col(df.columns[0]).cast(pl.String) + f" {code_suffix}").alias(df.columns[0])) + df = df.with_columns((f"{code_prefix} " + pl.col(df.columns[0]).cast( + pl.String) + f" {code_suffix}").alias(df.columns[0])) return df @@ -161,8 +164,8 @@ def create_df_dafm_milk() -> pl.dataframe.frame.DataFrame: pl.when(pl.col(df.columns[2]).cast(pl.String) == "as across") .then( pl.concat_str([ - pl.col(df.columns[1]) - ]) + pl.col(df.columns[1]) + ]) ) .otherwise( pl.concat_str([ @@ -175,13 +178,13 @@ def create_df_dafm_milk() -> pl.dataframe.frame.DataFrame: .alias(df.columns[1]) ) - df = df[:, [0, 1, 3]] # legal name: 1, trading name: 2 # 2 occurences of integer ending by space for code # IE2151EC (Ireland) starta by 2 spaces - df = df.with_columns(pl.col(df.columns[0]).str.replace_all('"', 
'').str.replace_all('\n\n', '')) + df = df.with_columns(pl.col(df.columns[0]).str.replace_all( + '"', '').str.replace_all('\n\n', '')) # replace non-integer by null for code # integer starting by 1 or starting by IE (Ireland) @@ -196,24 +199,27 @@ def create_df_dafm_milk() -> pl.dataframe.frame.DataFrame: ) # keep all row being integer - df_b = df.with_columns(pl.col(df.columns[0]).str.to_integer(strict=False).cast(pl.String)) + df_b = df.with_columns(pl.col(df.columns[0]).str.to_integer( + strict=False).cast(pl.String)) # keep only where integer in code column df_b = df_b.drop_nulls(subset=[df_b.columns[0]]) # add prefix and suffix - df_b = df_b.with_columns((f"{code_prefix} " + pl.col(df.columns[0]).cast(pl.String) + f" {code_suffix}").alias(df.columns[0])) + df_b = df_b.with_columns((f"{code_prefix} " + pl.col(df.columns[0]).cast( + pl.String) + f" {code_suffix}").alias(df.columns[0])) # missing comma in "3 Main St. Ballybunion, Co. Kerry" (Ireland) - df_b = df_b.with_columns(pl.col(df.columns[2]).str.replace("3 Main St. Ballybunion", "3 Main St., Ballybunion")) + df_b = df_b.with_columns(pl.col(df.columns[2]).str.replace( + "3 Main St. Ballybunion", "3 Main St., Ballybunion")) df = pl.concat([df_a, df_b]) df = df.with_columns( - pl.col(df.columns[2]) - .str.strip_chars() - .str.replace_all(r"\n", ", ") - .str.replace_all(r"\s+", " ") - .str.replace_all(r",,", ",") - .str.replace_all(r", ,", ",") - ) + pl.col(df.columns[2]) + .str.strip_chars() + .str.replace_all(r"\n", ", ") + .str.replace_all(r"\s+", " ") + .str.replace_all(r",,", ",") + .str.replace_all(r", ,", ",") + ) return df @@ -226,7 +232,6 @@ def get_data_online(): df_dafm_meat = create_df_dafm_meat() df_dafm_milk = create_df_dafm_milk() - # Health Service Executive (HSE) # no codes in url_hse_butcher # df_hse_butcher = parse_from_website(url_hse_butcher) @@ -238,14 +243,14 @@ def get_data_online(): df_sfpa_factory = parse_from_website(url_sfpa_factory) df = pl.concat([ - df_la_establishment, - df_dafm_meat, - df_dafm_milk, - df_hse_establishments, - df_sfpa_establishments, - df_sfpa_freezer, - df_sfpa_factory - ]) + df_la_establishment, + df_dafm_meat, + df_dafm_milk, + df_hse_establishments, + df_sfpa_establishments, + df_sfpa_freezer, + df_sfpa_factory + ]) df.write_csv(output_file, separator=';') diff --git a/scripts/packager-codes/ie_packagers_refresh_part2.py b/scripts/packager-codes/ie_packagers_refresh_part2.py index 25f20ff97fe97..20b2322fc2657 100644 --- a/scripts/packager-codes/ie_packagers_refresh_part2.py +++ b/scripts/packager-codes/ie_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. 
Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -25,12 +25,12 @@ import json -counties_list = ["antrim", "armagh", "carlow", "cavan", "clare", "cork", \ - "donegal", "down", "dublin", "fermanagh", "galway", \ - "kerry", "kildare", "kilkenny", "laois", "leitrim", \ - "limerick", "londonderry", "longford", "louth", "mayo", \ - "meath", "monaghan", "offaly", "roscommon", "sligo", \ - "tipperary", "tyrone", "waterford", "westmeath", \ +counties_list = ["antrim", "armagh", "carlow", "cavan", "clare", "cork", + "donegal", "down", "dublin", "fermanagh", "galway", + "kerry", "kildare", "kilkenny", "laois", "leitrim", + "limerick", "londonderry", "longford", "louth", "mayo", + "meath", "monaghan", "offaly", "roscommon", "sligo", + "tipperary", "tyrone", "waterford", "westmeath", "wexford", "wicklow"] @@ -46,7 +46,8 @@ def possible_county_check(possible_field: str) -> str: possible_field = possible_field[:match.start()] possible_field = possible_field.strip() - extracted_county = possible_field.lower().replace('co. ', '').replace('co.', '').replace('co ', '') + extracted_county = possible_field.lower().replace( + 'co. ', '').replace('co.', '').replace('co ', '') if extracted_county in counties_list: return extracted_county.title() else: @@ -57,8 +58,9 @@ def possible_postcode_check(field: str) -> bool: # examples: D24 NY84, X35 Y670, P51A525, D22 E6P4 field = field.replace(' ', '') if len(field) < 7: - return False - postcode_bool = all([field[0].isalpha(), field[1:3].isdigit(), field[3].isalpha(), field[4:6].isalnum(), field[6:].isdigit()]) + return False + postcode_bool = all([field[0].isalpha(), field[1:3].isdigit( + ), field[3].isalpha(), field[4:6].isalnum(), field[6:].isdigit()]) return postcode_bool @@ -86,7 +88,8 @@ def missing_comma_check(field: str) -> str: print(f"missing_comma_check, split: {decomposed}") updated_field = [] for i in decomposed: - print(f"missing_comma_check, loop element: {i}, updated field: {updated_field}") + print(f"missing_comma_check, loop element: { + i}, updated field: {updated_field}") possible_county = possible_county_check(i) if possible_county: missing_comma_check_update_county(possible_county, updated_field) @@ -101,13 +104,13 @@ def missing_comma_check(field: str) -> str: continue updated_field.append(i) - + print(f"missing_comma_check, before join: {updated_field}") updated_field = " ".join(updated_field) print(f"missing_comma_check, after join: {updated_field}") updated_field = updated_field.strip(',') print(f"missing_comma_check, after strip: {updated_field}") - + return updated_field @@ -152,16 +155,16 @@ def extract_address_components_two_commas(address_components: list) -> tuple: def extract_address_components_more_than_two_commas(address_components: list) -> tuple: - # start from the end and + # start from the end and # assign county if found, # ignore additional county if found, # ignore postcode if found, # ignore 'Ireland' if found - + street = '' city = '' county = '' - + for i in range(1, len(address_components)+1): possible_county = possible_county_check(address_components[-i]) possible_postcode = possible_postcode_check(address_components[-i]) @@ -172,14 +175,15 @@ def extract_address_components_more_than_two_commas(address_components: list) -> # example: Quinlan Steele, Milleens Cheese ltd., Eyeries, 
Beara, Co.Cork, Ireland, P75 FN52 if address_components[-i].lower() == 'ireland': continue - + if not possible_county and not possible_postcode: if not city: city = address_components[-i] elif not street: street = address_components[-i] else: - print(f"info, extract_address_components_more_than_two_commas, already extracted everything, ignore: {address_components[-i]}") + print(f"info, extract_address_components_more_than_two_commas, already extracted everything, ignore: { + address_components[-i]}") return street, city, county @@ -187,12 +191,16 @@ def extract_address_components_more_than_two_commas(address_components: list) -> def extract_address_components(address_to_convert): address_split = address_to_convert.split(',') # handle cases like Co Cork P85AT89 -> Co Cork, P85AT89 (Ireland) - address_split_with_sublist = [missing_comma_check(i).split(',') for i in address_split] - address_split = [item for sublist in address_split_with_sublist for item in sublist] - print(f"extract_address_components, address_split after missing comma check: {address_split}") + address_split_with_sublist = [missing_comma_check( + i).split(',') for i in address_split] + address_split = [ + item for sublist in address_split_with_sublist for item in sublist] + print(f"extract_address_components, address_split after missing comma check: { + address_split}") address_split = [x.strip() for x in address_split] - print(f"extract_address_components, address_split after strip elements: {address_split}") + print(f"extract_address_components, address_split after strip elements: { + address_split}") street = '' city = '' @@ -206,13 +214,16 @@ def extract_address_components(address_to_convert): city = address_split[0] elif len(address_split) == 2: print("info, extract_address_components, exactly 1 comma") - street, city, county = extract_address_components_one_comma(address_split) + street, city, county = extract_address_components_one_comma( + address_split) elif len(address_split) == 3: print("info, extract_address_components, contains 2 commas") - street, city, county = extract_address_components_two_commas(address_split) + street, city, county = extract_address_components_two_commas( + address_split) else: print("info, extract_address_components, more than 2 commas") - street, city, county = extract_address_components_more_than_two_commas(address_split) + street, city, county = extract_address_components_more_than_two_commas( + address_split) print(f"street: {street}, city: {city}, county: {county}") return street, city, county @@ -248,7 +259,6 @@ def cached_get(url: str, cache) -> list: else: restart = False - # Store the JSON response in the cache cache[url] = json.dumps(data) @@ -260,7 +270,7 @@ def no_results_update_query(url: str, i: int) -> str: if i == 1: url = re.sub("street=[^&]*&", "", url) print(f"no_results_update_query, remove street: {url}") - elif i ==2: + elif i == 2: url = re.sub("city=[^&]*&", "", url) print(f"no_results_update_query, remove city: {url}") else: @@ -289,13 +299,13 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: iter_failures = 0 while failed: with dbm.open('cache', 'c') as cache: - data = cached_get(url, cache) - if data != []: - lat, lng = [data[0]['lat'], data[0]['lon']] - failed = False - else: - iter_failures += 1 - url = no_results_update_query(url, iter_failures) + data = cached_get(url, cache) + if data != []: + lat, lng = [data[0]['lat'], data[0]['lon']] + failed = False + else: + iter_failures += 1 + url = no_results_update_query(url, iter_failures) 
return [lat, lng] @@ -305,7 +315,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: country_name = 'Ireland' source_file = f'{country_code}-merge-UTF-8_no_coord.csv' target_file = f'{country_code}-merge-UTF-8.csv' - index_last_line_processed = f'{country_code.lower()}_packagers_refresh_part2_index_tmp.txt' + index_last_line_processed = f'{ + country_code.lower()}_packagers_refresh_part2_index_tmp.txt' # use user agent for requests headers = {'User-Agent': 'packager-openfoodfacts'} @@ -335,7 +346,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/it_packagers_refresh.py b/scripts/packager-codes/it_packagers_refresh.py index 73790de076ed2..09fa38805c752 100644 --- a/scripts/packager-codes/it_packagers_refresh.py +++ b/scripts/packager-codes/it_packagers_refresh.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -52,12 +52,13 @@ import requests_cache requests_cache.install_cache('temp', use_cache_dir=True) except ImportError as e: - pass # not caching + pass # not caching def clean_str(expr): return expr.str.strip_chars().str.strip_chars(', ').str.replace_all(r'\s+,', ',').str.replace_all(r'\s+', ' ') + def concat_in_group(expr): return expr.map_batches(lambda l: l.list.unique().list.sort().list.join('|'), agg_list=True, returns_scalar=True).replace('', None) @@ -69,7 +70,8 @@ def get_alimenti(csv): # headers SPOA # precedente_bollo_cee;num_identificativo_produzione_commercializzazione;ragione_sociale;indirizzo;comune;provincia;codice_regione;regione;classificazione_stabilimento;codice_impianto_attivita;descrizione_impianto_attivita;prodotti_abilitati;specifica_prodotti_abilitati;paesi_export_autorizzato;longitudine;latitudine;stato_localizzazione;cod_fiscale;p_iva;codice_comune;stato_attivita;data_ultimo_aggiornamento f = io.BytesIO(csv_file.content) - df = pl.read_csv(f, separator=';', schema_overrides={'longitudine': str, 'latitudine': str}) + df = pl.read_csv(f, separator=';', schema_overrides={ + 'longitudine': str, 'latitudine': str}) df = df.rename({ 'num_identificativo_produzione_commercializzazione': 'codice', 'p_iva': 'vat', @@ -84,14 +86,19 @@ def get_alimenti(csv): 'classificazione_stabilimento': 'class', 'codice_impianto_attivita': 'plant', }).with_columns( - pl.col('codice').str.replace(r'^UE IT\s+(.+)$', 'IT ${1} CE').replace('UE IT ', None).replace('ABP ', None).alias('code'), + pl.col('codice').str.replace(r'^UE IT\s+(.+)$', + 'IT ${1} CE').replace('UE IT ', None).replace('ABP ', None).alias('code'), clean_str(pl.col('address')), clean_str(pl.col('name')), clean_str(pl.col('vat').replace('-', None).replace('XXXXXXX', None)), clean_str(pl.col('fiscal_code').replace('-', None)), clean_str(pl.col('paesi_export_autorizzato').replace('-', None)), - pl.col('lat').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}'), # some lat/lon have an extra dot - pl.col('lon').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}'), # some lat/lon have an extra dot + 
pl.col('lat').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}').str.replace( + # some lat/lon have an extra dot + r'(\d+\.\d+)\.(\d+)', '${1}${2}'), + pl.col('lon').str.replace(r'(\d+\.\d+)\.(\d+)', '${1}${2}').str.replace( + # some lat/lon have an extra dot + r'(\d+\.\d+)\.(\d+)', '${1}${2}'), ).sort('code', 'vat', 'fiscal_code') df_uq = df.group_by( @@ -106,12 +113,13 @@ def get_alimenti(csv): return df_uq # df_uq.write_csv(str(output_file), separator=';') - + if __name__ == "__main__": code_prefix = 'IT' code_suffix = 'CE' output_file = f'{code_prefix}-merge-UTF-8.csv' - output_file = Path(__file__).parent.parent.parent / 'packager-codes' / output_file + output_file = Path(__file__).parent.parent.parent / \ + 'packager-codes' / output_file output_file = output_file.resolve() # use user agent for requests headers = {'User-Agent': 'packager-openfoodfacts'} @@ -119,14 +127,17 @@ def get_alimenti(csv): session.headers = headers # TODO get latest csv urls from web permalinks - alimenti_web_permalink = 'https://www.dati.salute.gov.it/dataset/stabilimenti_italiani_reg_CE_853_2004.jsp' # '.container a[href*=".csv"]:has(> span)' - sottoprodotti_web_permalink = 'https://www.dati.salute.gov.it/dataset/stabilimenti_italiani_reg_CE_1069_2009.jsp' # '.container a[href*=".csv"]:has(> span)' + # '.container a[href*=".csv"]:has(> span)' + alimenti_web_permalink = 'https://www.dati.salute.gov.it/dataset/stabilimenti_italiani_reg_CE_853_2004.jsp' + # '.container a[href*=".csv"]:has(> span)' + sottoprodotti_web_permalink = 'https://www.dati.salute.gov.it/dataset/stabilimenti_italiani_reg_CE_1069_2009.jsp' alimenti_csv = 'https://www.dati.salute.gov.it/sites/default/files/opendata/STAB_POA_8_20241030.csv' sottoprodotti_csv = 'https://www.dati.salute.gov.it/sites/default/files/opendata/STAB_SPOA_9_20241030.csv' alimenti_df = get_alimenti(alimenti_csv) sottoprodotti_df = get_alimenti(sottoprodotti_csv) - df_merged = pl.concat([alimenti_df, sottoprodotti_df]).sort('code', 'vat', 'fiscal_code') + df_merged = pl.concat([alimenti_df, sottoprodotti_df]).sort( + 'code', 'vat', 'fiscal_code') df_merged.write_csv(str(output_file), separator=';') diff --git a/scripts/packager-codes/lu_packagers_refresh_part1.py b/scripts/packager-codes/lu_packagers_refresh_part1.py index 4db88f62f1a3c..fa0a89f7bdcdc 100644 --- a/scripts/packager-codes/lu_packagers_refresh_part1.py +++ b/scripts/packager-codes/lu_packagers_refresh_part1.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -61,10 +61,12 @@ def extract_row_from_online_table(row: list) -> dict: row_data = {} # start by code_prefix (and end by code_suffix) if not row[0].strip().startswith(f'{code_prefix} '): - approval_number = f"{code_prefix} {row[0].replace('-', '').strip()} {code_suffix}" + approval_number = f"{code_prefix} { + row[0].replace('-', '').strip()} {code_suffix}" else: approval_number = row[0].strip() - address = row[2].replace('
', ', ').strip() + ", " + row[3] + ", " + row[4] + address = row[2].replace('
', ', ').strip() + \ + ", " + row[3] + ", " + row[4] address = address.replace(' ', '') address = address.replace(',,', ',') address = address.replace(', ,', ',') @@ -91,13 +93,13 @@ def contains_number_check(value: str) -> bool: def parse_from_website(url: str) -> pl.dataframe.frame.DataFrame: try: html_content = requests.get(url, headers=headers).text - except requests.exceptions.ConnectionError : + except requests.exceptions.ConnectionError: print(f"parse_from_website, cannot get url {url}") if not html_content: print(f"parse_from_website, error with request {url}") sys.exit(1) - + soup = BeautifulSoup(html_content, 'html.parser') tables = soup.find_all('table') @@ -105,9 +107,10 @@ def parse_from_website(url: str) -> pl.dataframe.frame.DataFrame: data_rows = [] for table in tables: for tr in table.find_all('tr'): - raw_row_data = [td.get_text(separator=", ") for td in tr.find_all('td')] + raw_row_data = [td.get_text(separator=", ") + for td in tr.find_all('td')] print(f"parse_from_website, raw_row_data: {raw_row_data}") - + contains_number = contains_number_check(raw_row_data[0]) # ignore [] diff --git a/scripts/packager-codes/lu_packagers_refresh_part2.py b/scripts/packager-codes/lu_packagers_refresh_part2.py index 1fdc367c0627e..b6e7056c473c3 100644 --- a/scripts/packager-codes/lu_packagers_refresh_part2.py +++ b/scripts/packager-codes/lu_packagers_refresh_part2.py @@ -1,7 +1,7 @@ ''' This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France Product Opener is free software: you can redistribute it and/or modify @@ -36,8 +36,9 @@ def extract_address_components(address_to_convert): address_split = address_to_convert.split(',') address_split = [x.strip() for x in address_split] - street, postalcode, city = ", ".join(address_split[:-2]), address_split[-2], address_split[-1] - + street, postalcode, city = ", ".join( + address_split[:-2]), address_split[-2], address_split[-1] + return street, postalcode, city @@ -71,7 +72,6 @@ def cached_get(url: str, cache) -> list: else: restart = False - # Store the JSON response in the cache cache[url] = json.dumps(data) @@ -117,13 +117,13 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: iter_failures = 0 while failed: with dbm.open('cache', 'c') as cache: - data = cached_get(url, cache) - if data != []: - lat, lng = [data[0]['lat'], data[0]['lon']] - failed = False - else: - iter_failures += 1 - url = no_results_update_query(url, iter_failures) + data = cached_get(url, cache) + if data != []: + lat, lng = [data[0]['lat'], data[0]['lon']] + failed = False + else: + iter_failures += 1 + url = no_results_update_query(url, iter_failures) return [lat, lng] @@ -133,7 +133,8 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: country_name = 'Luxembourg' source_file = f'{country_code}-merge-UTF-8_no_coord.csv' target_file = f'{country_code}-merge-UTF-8.csv' - index_last_line_processed = f'{country_code.lower()}_packagers_refresh_part2_index_tmp.txt' + index_last_line_processed = f'{ + country_code.lower()}_packagers_refresh_part2_index_tmp.txt' # use user agent for requests headers = {'User-Agent': 'packager-openfoodfacts'} @@ -163,7 +164,7 @@ def convert_address_to_lat_lng(address_to_convert: str) -> list: row += ['lat', 'lng'] else: row += convert_address_to_lat_lng(row[2]) - + writer.writerow(row) with 
open(index_last_line_processed, 'w') as f: diff --git a/scripts/packager-codes/non-eu/non_eu_spider.py b/scripts/packager-codes/non-eu/non_eu_spider.py index 4de82926564d7..6fd4e0343fd80 100644 --- a/scripts/packager-codes/non-eu/non_eu_spider.py +++ b/scripts/packager-codes/non-eu/non_eu_spider.py @@ -9,7 +9,8 @@ def get_one(values: list) -> Any: if len(values) != 1: - raise ValueError("values list length must be equal to 1: {}".format(values)) + raise ValueError( + "values list length must be equal to 1: {}".format(values)) return values[0] @@ -36,19 +37,23 @@ class NonEuSpider(scrapy.Spider): def parse(self, response): for country_cell in response.xpath("//ul[@class='country-list']/li"): - country_name = country_cell.xpath("a[@class='country-name']/text()").get() + country_name = country_cell.xpath( + "a[@class='country-name']/text()").get() for section_table in country_cell.xpath("ul"): - section = section_table.xpath("preceding-sibling::h3[1]/text()").get() + section = section_table.xpath( + "preceding-sibling::h3[1]/text()").get() for doc_link in section_table.xpath("li/a"): file_path = doc_link.xpath("@href").get() - doc_loader = ItemLoader(item=NonEuDocumentItem(), selector=doc_link) + doc_loader = ItemLoader( + item=NonEuDocumentItem(), selector=doc_link) doc_loader.add_value("country_name", country_name) doc_loader.add_value("section", section) doc_loader.add_xpath("title", "text()") doc_loader.add_xpath("publication_date", "span/text()") doc_loader.add_value("file_path", file_path) - doc_loader.add_value("url", urljoin(response.url, file_path)) + doc_loader.add_value( + "url", urljoin(response.url, file_path)) yield doc_loader.load_item() diff --git a/scripts/packager-codes/non-eu/packager_codes.py b/scripts/packager-codes/non-eu/packager_codes.py index 587c868d55f9a..80a483e489ea2 100644 --- a/scripts/packager-codes/non-eu/packager_codes.py +++ b/scripts/packager-codes/non-eu/packager_codes.py @@ -26,19 +26,22 @@ def scrape_document_info() -> List[JSONObject]: country_name, title, url, publication_date, file_path, section. 
""" logger.info("Scraping remote document information") - cmd = "scrapy runspider --output - --output-format json --loglevel WARN".split(" ") + cmd = "scrapy runspider --output - --output-format json --loglevel WARN".split( + " ") cmd.append(str(SCRAPY_SPIDER_FILE_PATH)) cmd_res = subprocess.run(cmd, stdout=subprocess.PIPE, check=True) return json.loads(cmd_res.stdout.decode()) def download_documents(document_info: Sequence[JSONObject], dest_dir: Path) -> None: - logger.info("Downloading %s documents into '%s'", len(document_info), dest_dir) + logger.info("Downloading %s documents into '%s'", + len(document_info), dest_dir) dest_dir = Path(dest_dir) for i, doc_info in enumerate(document_info): dest_path = dest_dir / doc_info["file_path"] logger.info( - "(%s/%s) Downloading %s", i + 1, len(document_info), doc_info["url"] + "(%s/%s) Downloading %s", i + + 1, len(document_info), doc_info["url"] ) dest_path.parent.mkdir(parents=True, exist_ok=True) with urlopen(doc_info["url"]) as response, dest_path.open("wb") as dest_file: @@ -62,7 +65,8 @@ def document_info_diff( ) ] unchanged_names = ( - set(local_docs.keys()).difference(removed_names).difference(updated_names) + set(local_docs.keys()).difference( + removed_names).difference(updated_names) ) return { diff --git a/scripts/packager-codes/non-eu/pdf_extraction.py b/scripts/packager-codes/non-eu/pdf_extraction.py index 81343cdcafba4..00688c0813735 100644 --- a/scripts/packager-codes/non-eu/pdf_extraction.py +++ b/scripts/packager-codes/non-eu/pdf_extraction.py @@ -21,10 +21,12 @@ def extract_page(page: Page): table_page = page.crop(table_bbox) # Get table lines - vertical_lines_x = sorted(set(p[0] for p in table_page.curves[0]["points"])) + vertical_lines_x = sorted(set(p[0] + for p in table_page.curves[0]["points"])) vertical_lines_x = [table_page.curves[1]["x0"]] + vertical_lines_x + [ table_page.curves[1]["x1"]] - horizontal_lines_y = sorted(c["points"][0][1] for c in table_page.curves[1:]) + horizontal_lines_y = sorted(c["points"][0][1] + for c in table_page.curves[1:]) horizontal_lines_y = [table_page.curves[0]["top"]] + horizontal_lines_y + [ table_page.curves[-1]["bottom"]] diff --git a/scripts/packager-codes/poland_packager_code.py b/scripts/packager-codes/poland_packager_code.py index 0286ca1f78e61..efc4ae71d6233 100644 --- a/scripts/packager-codes/poland_packager_code.py +++ b/scripts/packager-codes/poland_packager_code.py @@ -4,10 +4,12 @@ import geocoder from bs4 import BeautifulSoup + def make_request(url): r = requests.get(url) return r.text + path_csv_file = './results.csv' with open(path_csv_file, mode="w") as csv_file: csv_writer = csv.writer(csv_file, delimiter=";") @@ -16,7 +18,7 @@ def make_request(url): base_url = 'https://pasze.wetgiw.gov.pl/spi/demozatw/index.php?kodwoj=&kodpow=&szukanaNazwa=&szukanaMiejsc=&szukanyWni=&onpage=20&poprzedniaSekcja=1&gatunek=&kategoria=' # Total of 18 categories so we iterate over it - for i in range (1, 18): + for i in range(1, 18): url = base_url + '&sekcja=' + str(i) results = make_request(url) # Get count number and then make requests for all the pages diff --git a/scripts/packager-codes/portugal-concatenate-csv-sections.py b/scripts/packager-codes/portugal-concatenate-csv-sections.py index 13f3a2d4eb2a5..79028e6724bc1 100644 --- a/scripts/packager-codes/portugal-concatenate-csv-sections.py +++ b/scripts/packager-codes/portugal-concatenate-csv-sections.py @@ -1,38 +1,43 @@ -## This program allows you to create a new file with all merged csv. 
-## If there is a preexisting file it will append to it (rewritting the column labels once) -print ("Program starting") -import csv +# This program allows you to create a new file with all merged csv. +# If there is a preexisting file it will append to it (rewritting the column labels once) import os -### Lines with 3 ### might need adaptation to your need -MAIN_CSV = 'Portugal_concatenated-UTF.csv' ### PATH of the csv where you will agregate everything -folder_path ='sources' ### SET to your folder with all csv +import csv +print("Program starting") +# Lines with 3 ### might need adaptation to your need +# PATH of the csv where you will agregate everything +MAIN_CSV = 'Portugal_concatenated-UTF.csv' +folder_path = 'sources' # SET to your folder with all csv -is_header = os.path.isfile(MAIN_CSV) #Just check if we are creating the file or not +# Just check if we are creating the file or not +is_header = os.path.isfile(MAIN_CSV) -main_csv_file = open(MAIN_CSV, 'a', newline='',encoding = "UTF-8") # 'a' to append and not overwrite -main_csv_writer = csv.writer(main_csv_file, delimiter=';',quotechar='|') ### DELIMITER to choose wisely -j=0 -for file in os.listdir(folder_path): #loop over all files - if not file.startswith('.'): #avoid any hidden file, useful on mac - print ('handling file '+str(j)+' : '+file) - with open(folder_path+'/'+file, newline='',encoding = "windows-1252") as csvfile: ###Adapt encoding to your country source encoding - csvreader = csv.reader(csvfile, delimiter=';',quotechar='|') - i=0 - if not is_header: ### add column labels if the file doesn't preexist - is_header=True - header = next(csvreader) - main_csv_writer.writerow(header) - i+=1 ### consequence of using next() which moves row 1 position forward - for row in csvreader: - #print ('row = ', i) - #print (row) - if i!=0: ### to avoid the first line with all column labels on every file - row[0]=row[0][2:][:-2] ### removes the \"= and \" from the first column ID (needed in Portugal csv file) - main_csv_writer.writerow(row) - i+=1 - print (str(i)+' rows have been loaded') - j+=1 +# 'a' to append and not overwrite +main_csv_file = open(MAIN_CSV, 'a', newline='', encoding="UTF-8") +# DELIMITER to choose wisely +main_csv_writer = csv.writer(main_csv_file, delimiter=';', quotechar='|') +j = 0 +for file in os.listdir(folder_path): # loop over all files + if not file.startswith('.'): # avoid any hidden file, useful on mac + print('handling file '+str(j)+' : '+file) + # Adapt encoding to your country source encoding + with open(folder_path+'/'+file, newline='', encoding="windows-1252") as csvfile: + csvreader = csv.reader(csvfile, delimiter=';', quotechar='|') + i = 0 + if not is_header: # add column labels if the file doesn't preexist + is_header = True + header = next(csvreader) + main_csv_writer.writerow(header) + i += 1 # consequence of using next() which moves row 1 position forward + for row in csvreader: + # print ('row = ', i) + # print (row) + if i != 0: # to avoid the first line with all column labels on every file + # removes the \"= and \" from the first column ID (needed in Portugal csv file) + row[0] = row[0][2:][:-2] + main_csv_writer.writerow(row) + i += 1 + print(str(i)+' rows have been loaded') + j += 1 main_csv_file.close() print("") -print ("Process complete, results in : "+MAIN_CSV) - +print("Process complete, results in : "+MAIN_CSV) diff --git a/scripts/packager-codes/si-packagers-refresh.py b/scripts/packager-codes/si-packagers-refresh.py index 5861f04060754..dd4761983cb9b 100644 --- 
a/scripts/packager-codes/si-packagers-refresh.py +++ b/scripts/packager-codes/si-packagers-refresh.py @@ -2,7 +2,7 @@ This file is part of Product Opener. Product Opener -Copyright (C) 2011-2023 Association Open Food Facts +Copyright (C) 2011-2024 Association Open Food Facts Contact: contact@openfoodfacts.org Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France @@ -65,190 +65,200 @@ import json - file_name = "slovenian_packaging_raw.csv" -api_key = "" # TODO remove +api_key = "" # TODO remove output_file_name = 'SI-merge-UTF-8.csv' + def clean_code(input_code: str) -> str: - # remove double spaces - input_code = input_code.replace(' ', ' ') + # remove double spaces + input_code = input_code.replace(' ', ' ') - # SI H-1015 ES - if input_code.endswith('ES'): - input_code = input_code.replace('ES', '').strip() + # SI H-1015 ES + if input_code.endswith('ES'): + input_code = input_code.replace('ES', '').strip() - # SI M-1035 SI - if input_code.endswith('SI'): - input_code = re.sub(r"\b(SI|ES)$", "", input_code).strip() + # SI M-1035 SI + if input_code.endswith('SI'): + input_code = re.sub(r"\b(SI|ES)$", "", input_code).strip() - # SI H-731, SI 731 - if ',' in input_code: - input_code = "".join(input_code.split(', ')[1]) + # SI H-731, SI 731 + if ',' in input_code: + input_code = "".join(input_code.split(', ')[1]) - # SI H - 728, SI H 728, SI H-728, also with M - input_code = input_code.replace('H - ', 'H-') - input_code = input_code.replace('H ', 'H-') - input_code = input_code.replace('M - ', 'M-') - input_code = input_code.replace('M ', 'M-') + # SI H - 728, SI H 728, SI H-728, also with M + input_code = input_code.replace('H - ', 'H-') + input_code = input_code.replace('H ', 'H-') + input_code = input_code.replace('M - ', 'M-') + input_code = input_code.replace('M ', 'M-') - # SI - 907 -> SI 907 - input_code = input_code.replace(' - ', ' ') - input_code = input_code.replace(' -', ' ') - input_code = input_code.replace('SI-', 'SI ') + # SI - 907 -> SI 907 + input_code = input_code.replace(' - ', ' ') + input_code = input_code.replace(' -', ' ') + input_code = input_code.replace('SI-', 'SI ') - # SI1194 - if 'SI ' not in input_code: - input_code = input_code.replace('SI', 'SI ') - # SI M1106 - if 'M-' not in input_code: - input_code = input_code.replace('M', 'M-') + # SI1194 + if 'SI ' not in input_code: + input_code = input_code.replace('SI', 'SI ') + # SI M1106 + if 'M-' not in input_code: + input_code = input_code.replace('M', 'M-') - return input_code + return input_code def clean_address(input_address: str) -> str: - # special character because - # sometimes new line between 2 addreses - # sometimes line for single address split in 2 - input_address = "<>".join(input_address.split('\r')) + # special character because + # sometimes new line between 2 addreses + # sometimes line for single address split in 2 + input_address = "<>".join(input_address.split('\r')) + + # fetch last occurence + # words 123A, place, 4567 city name + # Á found in a city name (PROSENJAKOVCI -PÁRTOSFALVA) + pattern = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s.-]+\d+[ABCDEFGIJ]?),(?:[a-zčćžđšA-ZČĆŽĐŠŽ\s\<\>.-]+,\s*)?[\<\>]*(\s*\d{4}[a-zčćžđšA-ZČĆŽĐŠŽÁ\s\<\>.-]+)$)' + # SI M-316 - should be Fužinska Ulica 1, 4220 Škofja Loka - not Kidričeva Cesta 63A, 4220 Škofja Loka + + match = re.search(pattern, input_address) + + if match: + output_address = (f"{match.group(2).strip().title()}, { + match.group(3).replace('<>', ' ').strip().title()}") + else: + # MOŠNJE , MOŠNJE, 4240 RADOVLJICA -> no street number (also DIJAŠKA ULICA , 
5220 TOLMIN) + # instead, fetch "something, postal_code city" + pattern_2 = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s\-\.]+),(\s*\d{4})([a-zčćžđšA-ZČĆŽĐŠŽÁ\s\-\.\<\>]+)$)' + match_2 = re.search(pattern_2, input_address) + if match_2: + output_address = (f"{match_2.group(2).strip().title()}, { + match_2.group(3).replace('<>', ' ').strip().title()}") + else: + print("Match problem", input_address) + output_address = input_address + + return output_address - # fetch last occurence - # words 123A, place, 4567 city name - # Á found in a city name (PROSENJAKOVCI -PÁRTOSFALVA) - pattern = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s.-]+\d+[ABCDEFGIJ]?),(?:[a-zčćžđšA-ZČĆŽĐŠŽ\s\<\>.-]+,\s*)?[\<\>]*(\s*\d{4}[a-zčćžđšA-ZČĆŽĐŠŽÁ\s\<\>.-]+)$)' - # SI M-316 - should be Fužinska Ulica 1, 4220 Škofja Loka - not Kidričeva Cesta 63A, 4220 Škofja Loka +def cached_get(url: str, cache) -> list: + # Check if the URL is already in the cache + if url in cache: + # If yes, return the cached response + print(" from cache") + return json.loads(cache[url]) + + # restart 3 times in case of empty response to make sure it is not an issue in API-side + restart = True + i = 0 + while restart: + # If not, make the HTTP request + try: + response = requests.get(url) + except (requests.exceptions.RequestException, KeyError, IndexError) as e: + return [] + data = response.json() + if data == [] and i < 3: + i += 1 + print(" restart ", i) + sleep(1) + else: + restart = False + + # Store the JSON response in the cache + cache[url] = json.dumps(data) + + return data - match = re.search(pattern, input_address) - - if match: - output_address = (f"{match.group(2).strip().title()}, {match.group(3).replace('<>', ' ').strip().title()}") - else: - # MOŠNJE , MOŠNJE, 4240 RADOVLJICA -> no street number (also DIJAŠKA ULICA , 5220 TOLMIN) - # instead, fetch "something, postal_code city" - pattern_2 = r'(([a-zčćžđšA-ZČĆŽĐŠŽ\s\-\.]+),(\s*\d{4})([a-zčćžđšA-ZČĆŽĐŠŽÁ\s\-\.\<\>]+)$)' - match_2 = re.search(pattern_2, input_address) - if match_2: - output_address = (f"{match_2.group(2).strip().title()}, {match_2.group(3).replace('<>', ' ').strip().title()}") - else: - print("Match problem", input_address) - output_address = input_address +def convert_address_to_lat_lng(address_to_convert: str) -> str: + # free plan: 1 request per second + sleep(1) - return output_address + print("address_to_convert: ", address_to_convert) + street, post_and_town = address_to_convert.split(',') + postalcode = post_and_town.strip().split()[0] + town = " ".join(post_and_town.strip().split()[1:]) + url = f"https://geocode.maps.co/search?street={street}&town={town}&postalcode={ + postalcode}&country=Slovenia&country_code=si&api_key={api_key}" -def cached_get(url: str, cache) -> list: - # Check if the URL is already in the cache - if url in cache: - # If yes, return the cached response - print(" from cache") - return json.loads(cache[url]) - - # restart 3 times in case of empty response to make sure it is not an issue in API-side - restart = True - i = 0 - while restart: - # If not, make the HTTP request - try: - response = requests.get(url) - except (requests.exceptions.RequestException, KeyError, IndexError) as e: - return [] - data = response.json() - if data == [] and i < 3: - i += 1 - print(" restart ", i) - sleep(1) - else: - restart = False - - - # Store the JSON response in the cache - cache[url] = json.dumps(data) - - return data + with dbm.open('cache', 'c') as cache: + data = cached_get(url, cache) + if data != []: + lat_lng = f"{data[0]['lat']},{data[0]['lon']}" + else: + sleep(1) + # drop 
housenumber (example: Vrhpolje 1D, 5271 Vipava) + url_2 = f"https://geocode.maps.co/search?street={' '.join(street.split()[:-1])}&town={ + town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}" + print(" try remove house number") -def convert_address_to_lat_lng(address_to_convert: str) -> str: - # free plan: 1 request per second - sleep(1) - - print("address_to_convert: ", address_to_convert) - street, post_and_town = address_to_convert.split(',') - postalcode = post_and_town.strip().split()[0] - town = " ".join(post_and_town.strip().split()[1:]) - - url = f"https://geocode.maps.co/search?street={street}&town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}" - - with dbm.open('cache', 'c') as cache: - data = cached_get(url, cache) - if data != []: - lat_lng = f"{data[0]['lat']},{data[0]['lon']}" - else: - sleep(1) - # drop housenumber (example: Vrhpolje 1D, 5271 Vipava) - url_2 = f"https://geocode.maps.co/search?street={' '.join(street.split()[:-1])}&town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}" - - print(" try remove house number") - - data = cached_get(url_2, cache) - - if data != []: - lat_lng = f"{data[0]['lat']},{data[0]['lon']}" - else: - sleep(1) - # drop street (example: Gabrovlje 14, 3214 Zreče) - url_3 = f"https://geocode.maps.co/search?town={town}&postalcode={postalcode}&country=Slovenia&country_code=si&api_key={api_key}" - - print(" try remove street") - - data = cached_get(url_3, cache) - - if data != []: - lat_lng = f"{data[0]['lat']},{data[0]['lon']}" - else: - print(f'Empty response for: {address_to_convert}') - sys.exit(1) - - return lat_lng + data = cached_get(url_2, cache) + + if data != []: + lat_lng = f"{data[0]['lat']},{data[0]['lon']}" + else: + sleep(1) + # drop street (example: Gabrovlje 14, 3214 Zreče) + url_3 = f"https://geocode.maps.co/search?town={town}&postalcode={ + postalcode}&country=Slovenia&country_code=si&api_key={api_key}" + + print(" try remove street") + + data = cached_get(url_3, cache) + + if data != []: + lat_lng = f"{data[0]['lat']},{data[0]['lon']}" + else: + print(f'Empty response for: {address_to_convert}') + sys.exit(1) + + return lat_lng def main(): - if api_key == "": - print("missing API key") - sys.exit(1) + if api_key == "": + print("missing API key") + sys.exit(1) - df = pl.read_csv(file_name, separator=',') + df = pl.read_csv(file_name, separator=',') - # keep only needed columns - df_selected = df[:, [0, 1, 2]] + # keep only needed columns + df_selected = df[:, [0, 1, 2]] - # rename columns - new_column_names = ['code', 'name', 'address'] - df_renamed = df_selected.rename({i: j for i, j in zip(df_selected.columns, new_column_names)}) + # rename columns + new_column_names = ['code', 'name', 'address'] + df_renamed = df_selected.rename( + {i: j for i, j in zip(df_selected.columns, new_column_names)}) - # ignore rows if first column is "Approval No.", or if missing SI (sometimes just number or H + number) - df_filtered_tmp = df_renamed.filter(df_renamed['code'].str.starts_with("SI")) - df_filtered = df_filtered_tmp.with_columns(pl.col('code').map_elements(lambda x: clean_code(x), return_dtype=str)) + # ignore rows if first column is "Approval No.", or if missing SI (sometimes just number or H + number) + df_filtered_tmp = df_renamed.filter( + df_renamed['code'].str.starts_with("SI")) + df_filtered = df_filtered_tmp.with_columns( + pl.col('code').map_elements(lambda x: clean_code(x), return_dtype=str)) - # first column keep only first row 
(second row tell about business, meat processing, for example) - df_unique_name = df_filtered.with_columns(pl.col('name').map_elements(lambda x: "".join((x.split('\r')[0]).split(',')[0].title()), return_dtype=str)) + # first column keep only first row (second row tell about business, meat processing, for example) + df_unique_name = df_filtered.with_columns(pl.col('name').map_elements( + lambda x: "".join((x.split('\r')[0]).split(',')[0].title()), return_dtype=str)) - df_unique_address = df_unique_name.with_columns(pl.col('address').map_elements(lambda x: clean_address(x), return_dtype=str)) + df_unique_address = df_unique_name.with_columns( + pl.col('address').map_elements(lambda x: clean_address(x), return_dtype=str)) - # rm duplicates - df_deduplicated = df_unique_address.unique() + # rm duplicates + df_deduplicated = df_unique_address.unique() - df_lat_lng = df_deduplicated.with_columns(pl.col("address").map_elements(lambda x: convert_address_to_lat_lng(x), return_dtype=str).alias("lat_lng")) + df_lat_lng = df_deduplicated.with_columns(pl.col("address").map_elements( + lambda x: convert_address_to_lat_lng(x), return_dtype=str).alias("lat_lng")) - # split in 2 - df_lat = df_lat_lng.with_columns(pl.col('lat_lng').str.split(',').list.get(0).alias('lat')) - df_lng = df_lat.with_columns(pl.col('lat_lng').str.split(',').list.get(1).alias('lng')) + # split in 2 + df_lat = df_lat_lng.with_columns( + pl.col('lat_lng').str.split(',').list.get(0).alias('lat')) + df_lng = df_lat.with_columns( + pl.col('lat_lng').str.split(',').list.get(1).alias('lng')) - df_final = df_lng.drop(['lat_lng']) + df_final = df_lng.drop(['lat_lng']) - df_final.write_csv(output_file_name, separator=';') + df_final.write_csv(output_file_name, separator=';') if __name__ == "__main__": diff --git a/scripts/run_ocr.py b/scripts/run_ocr.py index ab71da6849991..58e54a922499f 100644 --- a/scripts/run_ocr.py +++ b/scripts/run_ocr.py @@ -25,7 +25,8 @@ logger = logging.getLogger() logger.setLevel(logging.DEBUG) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s") stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.INFO) stream_handler.setFormatter(formatter) @@ -134,7 +135,8 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal r_json = orjson.loads(r.content) responses = r_json["responses"] return ( - [(images_content[i][0], responses[i]) for i in range(len(images_content))], + [(images_content[i][0], responses[i]) + for i in range(len(images_content))], True, ) @@ -142,7 +144,8 @@ def run_ocr_on_image_paths(image_paths: List[pathlib.Path], override: bool = Fal def dump_ocr( image_paths: List[pathlib.Path], sleep: float = 0.0, override: bool = False ): - responses, performed_request = run_ocr_on_image_paths(image_paths, override) + responses, performed_request = run_ocr_on_image_paths( + image_paths, override) for image_path, response in responses: json_path = image_path.with_suffix(".json.gz") @@ -150,7 +153,8 @@ def dump_ocr( with gzip.open(str(json_path), "wb") as f: logger.debug("Dumping OCR JSON to %s", json_path) f.write( - orjson.dumps({"responses": [response], "created_at": int(time.time())}) + orjson.dumps( + {"responses": [response], "created_at": int(time.time())}) ) if performed_request and sleep: time.sleep(sleep) diff --git a/scripts/snippets/agribalyse_categories_csv.py b/scripts/snippets/agribalyse_categories_csv.py index 
8ba6b7465ba1b..d3a4a37748495 100644 --- a/scripts/snippets/agribalyse_categories_csv.py +++ b/scripts/snippets/agribalyse_categories_csv.py @@ -8,7 +8,8 @@ # get agribalyse -all_agri_cats = set(tagid for tagid, d in datas.items() for k in d.keys() if "agribalyse" in k) +all_agri_cats = set(tagid for tagid, d in datas.items() + for k in d.keys() if "agribalyse" in k) agri_cats = all_agri_cats # and children (until we have no more children) @@ -16,15 +17,18 @@ children = set( tagid for tagid, d in datas.items() - if set(d.get("parents",[])) & agri_cats + if set(d.get("parents", [])) & agri_cats ) all_agri_cats |= children agri_cats = children # get agribalyse exploring parents + + def agribalyse(tagid, depth=0): data = datas[tagid] - code = data.get("agribalyse_food_code", data.get("agribalyse_proxy_food_code", {})).get("en") + code = data.get("agribalyse_food_code", data.get( + "agribalyse_proxy_food_code", {})).get("en") if code: return code, depth # explore all parents and take the lowest depth @@ -50,13 +54,14 @@ def agribalyse(tagid, depth=0): "cat_en": d["name"].get("en", ""), "cat_fr": d["name"].get("fr", ""), "agribalyse_food_code": agri, - }) + }) len(rows) rows.sort(key=lambda r: r.get("cat_en") or r.get("cat_fr")) with open("environmental_scores-cat.csv", "w") as f: - writer = csv.DictWriter(f, fieldnames=["tagid", "cat_en", "cat_fr", "agribalyse_food_code"]) + writer = csv.DictWriter( + f, fieldnames=["tagid", "cat_en", "cat_fr", "agribalyse_food_code"]) writer.writeheader() writer.writerows(rows) diff --git a/scripts/update_tags_per_languages.py b/scripts/update_tags_per_languages.py index da7a2eda4948c..d006b223e84ed 100644 --- a/scripts/update_tags_per_languages.py +++ b/scripts/update_tags_per_languages.py @@ -67,17 +67,17 @@ "categories": "category", "countries": "country", "labels": "label", - "origins": "origin", + "origins": "origin", } headers = { - 'Accept': 'application/json', + 'Accept': 'application/json', 'User-Agent': 'UpdateTagsLanguages', } mapping_languages_countries = { "aa": "dj", - "ar": "world", # ar but categories are en: + "ar": "world", # ar but categories are en: "be": "by", "bg": "bg", "br": "fr", @@ -88,7 +88,7 @@ "de": "de", "el": "gr", "en": "world", - "xx": "world", # xx but categories are en: + "xx": "world", # xx but categories are en: "es": "es", "et": "ee", "fa": "ir", @@ -115,9 +115,10 @@ "zh": "cn", } + def get_from_api(get_call_url: str) -> dict: """Send a GET request to the given URL. - + :param get_call_url: the URL to send the request to :return: the API response @@ -129,9 +130,10 @@ def get_from_api(get_call_url: str) -> dict: ) if get_call_url_res.status_code != 200: - print(f"ERROR: when calling api. {get_call_url_res.status_code} status code. url: {get_call_url}") + print(f"ERROR: when calling api. { + get_call_url_res.status_code} status code. 
url: {get_call_url}") sys.exit() - + return get_call_url_res.json() @@ -162,7 +164,7 @@ def unknown_tags_taxonomy_comparison(api_result: dict, taxonomy_file_location: s pass # line is the last line last_tag = line.split(",")[0] - + # found the index of the last saved log last_tag_index = None for i, item in enumerate(all_tags): @@ -177,12 +179,12 @@ def unknown_tags_taxonomy_comparison(api_result: dict, taxonomy_file_location: s # not found, restart from the beginning else: log_file_1 = open(log_file_name_1.format(plural=tag_type), "w") - log_file_1.write("current tag, :found tag") + log_file_1.write("current tag, :found tag") # no file, start from the beginning else: log_file_1 = open(log_file_name_1.format(plural=tag_type), "w") - log_file_1.write("current tag, :found tag") - + log_file_1.write("current tag, :found tag") + for tag in all_tags: if tag["known"] == 0: # limit number of iterations @@ -191,20 +193,21 @@ def unknown_tags_taxonomy_comparison(api_result: dict, taxonomy_file_location: s tag_name = tag['name'] # should retrieve all "en:blablabla, tag_name" or "it:tag_name" - # the prefix is either the language or a comma. + # the prefix is either the language or a comma. # Suffix is either an end of line or a comma - tag_regex = re.compile(f'\n([a-z][a-z]:(?:[\w\s\-\']*\,-)*{tag_name})[,|\n]') + tag_regex = re.compile( + f'\n([a-z][a-z]:(?:[\w\s\-\']*\,-)*{tag_name})[,|\n]') tag_regex_res = tag_regex.findall(taxonomy_file_content) # found more than a single occurence in the taxonomy - # if exists, take value that correspond to "en" (i.e., unknown but + # if exists, take value that correspond to "en" (i.e., unknown but # already referenced in the taxonomy) # otherwise (i.e., only different languages than "en"), keep first occurence if len(tag_regex_res) > 1: # in the case that "en" is not in the list tag_regex_res_first = tag_regex_res[0] - + tag_regex_res = [x for x in tag_regex_res if "en:" in x] # "en" was not in the last put back first value in the list @@ -218,9 +221,11 @@ def unknown_tags_taxonomy_comparison(api_result: dict, taxonomy_file_location: s if tag_regex_res[0][:2] == "en": already_referenced_tags.append(tag['id']) else: - possible_wrong_language_tags[tag['id']] = tag_regex_res[0].split(',')[0] + possible_wrong_language_tags[tag['id']] = tag_regex_res[0].split(',')[ + 0] # save in the logs to ease resume if it crashes - log_file_1.write(f"\n{tag['id']},{tag_regex_res[0].split(',')[0]}") + log_file_1.write(f"\n{tag['id']},{ + tag_regex_res[0].split(',')[0]}") log_file_1.flush() # 0 occurences else: @@ -246,7 +251,7 @@ def unknown_tags_taxonomy_comparison(api_result: dict, taxonomy_file_location: s with open(output_file_name.format(plural=tag_type), "a") as output_possible_new_tag_file: output_possible_new_tag_file.write("possible_new_tags") for possible_new_tag in possible_new_tags: - output_possible_new_tag_file.write("\n" + possible_new_tag) + output_possible_new_tag_file.write("\n" + possible_new_tag) return @@ -262,20 +267,21 @@ def update_tags_field(tags_field_string: str, tags_field_lc: str, current_tag: s :return tags_field_string: updated tags_field_string """ - # language of the tags_field_string is the same as the language - # of the current tag that we want to remove, + # language of the tags_field_string is the same as the language + # of the current tag that we want to remove, # it will not be prefixed by the language. 
if tags_field_lc == current_tag[:2]: current_tag = current_tag.split(':')[1] # same if new tag is the same as the language if tags_field_lc == updated_tag[:2]: updated_tag = updated_tag.split(':')[1] - + # convert into list to better handle upper and lower cases, split and concatenation, spaces tags_fields = tags_field_string.split(",") # can contain upper case letters # create new list list as lower case - and remove space after commas (strip) - to get the index - tags_fields_lower = [x.lower().strip().replace(" ", "-", -1) for x in tags_fields] + tags_fields_lower = [x.lower().strip().replace(" ", "-", -1) + for x in tags_fields] # old tag is still in the field if current_tag in tags_fields_lower: @@ -306,17 +312,21 @@ def update_tags_field(tags_field_string: str, tags_field_lc: str, current_tag: s # is equivalent to leave the field as is tags_field_string = ",".join(tags_fields) - - + return tags_field_string def main(): - parser = argparse.ArgumentParser(description="Provide tags type (allergens, categories, countries, labels, origins). Also, provide environment (prod, dev), user and password") - parser.add_argument('--tags', required=True, help='tags type (allergens, categories, countries, labels, origins). Comma separated, and quotes') - parser.add_argument('--env', required=True, help='environment (prod, dev) to connect to openfoodfacts') - parser.add_argument('--user_id', help='user id to connect to openfoodfacts') - parser.add_argument('--password', help='password to connect to openfoodfacts') + parser = argparse.ArgumentParser( + description="Provide tags type (allergens, categories, countries, labels, origins). Also, provide environment (prod, dev), user and password") + parser.add_argument('--tags', required=True, + help='tags type (allergens, categories, countries, labels, origins). Comma separated, and quotes') + parser.add_argument('--env', required=True, + help='environment (prod, dev) to connect to openfoodfacts') + parser.add_argument( + '--user_id', help='user id to connect to openfoodfacts') + parser.add_argument( + '--password', help='password to connect to openfoodfacts') args = parser.parse_args() tags = args.tags.split(",") tags = [i.strip() for i in tags] @@ -339,7 +349,8 @@ def main(): env = "net" user = "off:off@" else: - print("Environment should be 'prod' or 'dev', unexpected value:", env, file=sys.stderr) + print("Environment should be 'prod' or 'dev', unexpected value:", + env, file=sys.stderr) sys.exit() for plural, singular in map_tags_field_url_parameter.items(): @@ -350,16 +361,19 @@ def main(): 'user_id': args.user_id, 'password': args.password, } - - # by default the query return 24 results. + + # by default the query return 24 results. 
# increase to 1000 (so far chips in EN (should be crisps in EN) # add max number of products for categories with ~550) - products_list_for_tag_url = f"https://{user}world.openfoodfacts.{env}/{singular}/{{tag_id_placeholder}}.json?page_size=1000" - - taxonomy_file_location = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', f'taxonomies/{plural}.txt')) - + products_list_for_tag_url = f"https://{user}world.openfoodfacts.{ + env}/{singular}/{{tag_id_placeholder}}.json?page_size=1000" + + taxonomy_file_location = os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', f'taxonomies/{plural}.txt')) + # country is needed otherwise _lc will be "en" - post_call_url = f"https://{user}{{country}}.openfoodfacts.{env}/cgi/product_jqm2.pl" + post_call_url = f"https://{user}{{country}}.openfoodfacts.{ + env}/cgi/product_jqm2.pl" # if log_file_name_2 exists, it means step 1) and 2) completely ran already, hence, resume from step 3) if not os.path.exists(log_file_name_2.format(plural=plural)): @@ -368,12 +382,13 @@ def main(): # example: # api_result = { # "tags": [ - # {"id": "it:frankreich", "known": 0, "name": "frankreich", ...}, + # {"id": "it:frankreich", "known": 0, "name": "frankreich", ...}, # ], # } # 2) fetch unknown tags and look into taxonomy - unknown_tags_taxonomy_comparison(api_result, taxonomy_file_location, plural, dev) + unknown_tags_taxonomy_comparison( + api_result, taxonomy_file_location, plural, dev) # create second log file with open(log_file_name_2.format(plural=plural), "w"): @@ -399,33 +414,35 @@ def main(): # line is the last line if line != "current tag, updated tag, product code": last_tag, last_product = line.split(",")[0], line.split(",")[2] - # remove from possible_wrong_language_tags the product that were already updated, + # remove from possible_wrong_language_tags the product that were already updated, # we keep last tag because maybe all products were not updated in previous run - sorted_dict = dict(sorted(possible_wrong_language_tags.items())) - possible_wrong_language_tags = {k: v for k, v in sorted_dict.items() if k >= last_tag} + sorted_dict = dict( + sorted(possible_wrong_language_tags.items())) + possible_wrong_language_tags = { + k: v for k, v in sorted_dict.items() if k >= last_tag} resume = True # only header was in the file, restart from beginning - else: + else: with open(log_file_name_2.format(plural=plural), "w") as log_file_2: - log_file_2.write("current tag, updated tag, product code") + log_file_2.write("current tag, updated tag, product code") # file exists and is empty else: with open(log_file_name_2.format(plural=plural), "w") as log_file_2: - log_file_2.write("current tag, updated tag, product code") - + log_file_2.write("current tag, updated tag, product code") # limit number of iterations - # for dev, number of elements in possible_wrong_language_tags + # for dev, number of elements in possible_wrong_language_tags # can be changed in unknown_tags_taxonomy_comparison() for current_tag, updated_tag in possible_wrong_language_tags.items(): - # 3) get all products for this tag - all_products_for_tag = get_from_api(products_list_for_tag_url.format(tag_id_placeholder=current_tag))["products"] + # 3) get all products for this tag + all_products_for_tag = get_from_api(products_list_for_tag_url.format( + tag_id_placeholder=current_tag))["products"] # example: # all_products_for_tag = { # "products": [ - # {"categories": "Lait", "categories_lc": "en", ...}, + # {"categories": "Lait", "categories_lc": "en", ...}, # ], # } @@ -446,23 
+463,26 @@ def main(): for i, product in enumerate(all_products_for_tag): if dev and i > 0: - break + break + + # 4) update tags_fields + updated_field = update_tags_field( + product[plural], product[f'{plural}_lc'], current_tag, updated_tag) - # 4) update tags_fields - updated_field = update_tags_field(product[plural], product[f'{plural}_lc'], current_tag, updated_tag) - # 5) finally, update if updated_field != product[plural] and not dev: # country is needed otherwise _lc will be "en" try: - country = mapping_languages_countries[product[f'{plural}_lc']] + country = mapping_languages_countries[product[f'{ + plural}_lc']] except KeyError: - print(f"ERROR: when updating product {product['code']}. Unknown country for this language: {product[f'{plural}_lc']}") + print(f"ERROR: when updating product { + product['code']}. Unknown country for this language: {product[f'{plural}_lc']}") sys.exit() data.update({ - 'code': product['code'], + 'code': product['code'], plural: updated_field, }) post_call_url_res = requests.post( @@ -471,16 +491,21 @@ def main(): headers=headers, ) if post_call_url_res.status_code != 200: - print(f"ERROR: when updating product {product['code']}. {post_call_url_res.status_code} status code") + print(f"ERROR: when updating product {product['code']}. { + post_call_url_res.status_code} status code") sys.exit() with open(log_file_name_2.format(plural=plural), "a") as log_file_2: - log_file_2.write(f"\n{current_tag},{updated_tag},{product['code']}") + log_file_2.write(f"\n{current_tag},{ + updated_tag},{product['code']}") log_file_2.flush() # finally, rename log files, next iteration should start from scratch - os.rename(log_file_name_1.format(plural=plural), log_file_name_1.format(plural=plural) + "_log") - os.rename(log_file_name_2.format(plural=plural), log_file_name_2.format(plural=plural) + "_log") + os.rename(log_file_name_1.format(plural=plural), + log_file_name_1.format(plural=plural) + "_log") + os.rename(log_file_name_2.format(plural=plural), + log_file_name_2.format(plural=plural) + "_log") + if __name__ == "__main__": main() diff --git a/scripts/update_tags_per_languages_tests.py b/scripts/update_tags_per_languages_tests.py index c97f85a5d2328..ccd1f26f9c61e 100644 --- a/scripts/update_tags_per_languages_tests.py +++ b/scripts/update_tags_per_languages_tests.py @@ -2,6 +2,7 @@ import unittest from update_tags_per_languages import get_from_api, unknown_tags_taxonomy_comparison, update_tags_field + class TestUpdateTagsPerLanguages(unittest.TestCase): def test_get_from_api(self): @@ -12,17 +13,17 @@ def test_get_from_api(self): "categories": "category", # "countries": "country", # "labels": "label", - # "origins": "origin", + # "origins": "origin", } - - for plural in map_tags_field_url_parameter.keys(): - tags_list_url = f"https://off:off@world.openfoodfacts.net/{plural}.json" - all_tags = get_from_api(tags_list_url) - self.assertTrue("tags" in all_tags) - self.assertTrue("id" in all_tags["tags"][0]) - self.assertTrue("known" in all_tags["tags"][0]) - self.assertTrue("name" in all_tags["tags"][0]) + for plural in map_tags_field_url_parameter.keys(): + tags_list_url = f"https://off:off@world.openfoodfacts.net/{ + plural}.json" + all_tags = get_from_api(tags_list_url) + self.assertTrue("tags" in all_tags) + self.assertTrue("id" in all_tags["tags"][0]) + self.assertTrue("known" in all_tags["tags"][0]) + self.assertTrue("name" in all_tags["tags"][0]) def test_get_from_api_products_list(self): products_list_for_tag_url = 
f"https://off:off@world.openfoodfacts.net/category/en:lait.json" @@ -30,19 +31,23 @@ def test_get_from_api_products_list(self): self.assertTrue("products" in all_products_for_tag) self.assertTrue("categories" in all_products_for_tag["products"][0]) self.assertTrue("categories_lc" in all_products_for_tag["products"][0]) - def test_unknown_tags_taxonomy_comparison_function(self): all_tags_dict = { "tags": [ - {"id": "en:snacks", "known": 1, "name": "snacks"}, # known - {"id": "en:groceries", "known": 0, "name": "groceries"}, # possible_new_tags - {"id": "en:cured-hams", "known": 0, "name": "cured-hams"}, # already_referenced_tags - {"id": "en:chips", "known": 0, "name": "chips"}, # possible_wrong_language_tags + {"id": "en:snacks", "known": 1, "name": "snacks"}, # known + {"id": "en:groceries", "known": 0, + "name": "groceries"}, # possible_new_tags + # already_referenced_tags + {"id": "en:cured-hams", "known": 0, "name": "cured-hams"}, + # possible_wrong_language_tags + {"id": "en:chips", "known": 0, "name": "chips"}, ], } - file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', f'taxonomies/categories.txt')) - unknown_tags_taxonomy_comparison(all_tags_dict, file, "categories", False) + file = os.path.abspath(os.path.join(os.path.dirname( + __file__), '..', f'taxonomies/categories.txt')) + unknown_tags_taxonomy_comparison( + all_tags_dict, file, "categories", False) possible_wrong_language_tags = {} with open('update_tags_per_languages_wrong_languages_detected_categories', 'r') as file: @@ -53,21 +58,26 @@ def test_unknown_tags_taxonomy_comparison_function(self): possible_wrong_language_tags[key] = value # (result, expected) - self.assertEqual(possible_wrong_language_tags, {'en:chips': 'de:chips'}) + self.assertEqual(possible_wrong_language_tags, + {'en:chips': 'de:chips'}) os.remove("update_tags_per_languages_wrong_languages_detected_categories") os.remove("update_tags_per_languages_possible_new_tags_categories") - def test_update_tags_field(self): - updated_field_1 = update_tags_field("Lait", "en", "en:lait", "fr:laits") + updated_field_1 = update_tags_field( + "Lait", "en", "en:lait", "fr:laits") self.assertEqual(updated_field_1, "fr:laits") - updated_field_2 = update_tags_field("Dairies,Milks,Lait", "en", "en:lait", "fr:laits") + updated_field_2 = update_tags_field( + "Dairies,Milks,Lait", "en", "en:lait", "fr:laits") self.assertEqual(updated_field_2, "Dairies,Milks, fr:laits") - updated_field_3 = update_tags_field("Snacks,Chips,Chips au paprika,Chips de pommes de terre,Chips de pommes de terre aromatisées,Chips et frites,Snacks salés", "en", "en:chips", "fr:chips") - self.assertEqual(updated_field_3, "Snacks, fr:chips,Chips au paprika,Chips de pommes de terre,Chips de pommes de terre aromatisées,Chips et frites,Snacks salés") + updated_field_3 = update_tags_field( + "Snacks,Chips,Chips au paprika,Chips de pommes de terre,Chips de pommes de terre aromatisées,Chips et frites,Snacks salés", "en", "en:chips", "fr:chips") + self.assertEqual( + updated_field_3, "Snacks, fr:chips,Chips au paprika,Chips de pommes de terre,Chips de pommes de terre aromatisées,Chips et frites,Snacks salés") + if __name__ == '__main__': unittest.main() diff --git a/scripts/usda-import/keep_most_recent_row_for_each_product.py b/scripts/usda-import/keep_most_recent_row_for_each_product.py index 95a98c5dd9a2a..65c982b312d6c 100644 --- a/scripts/usda-import/keep_most_recent_row_for_each_product.py +++ b/scripts/usda-import/keep_most_recent_row_for_each_product.py @@ -1,8 +1,9 @@ import pandas as pd 
 import sys
 
-if (len(sys.argv) != 2):
-    sys.stderr.write("Usage: python keep_most_recent_row_for_ech_product.pl [input CSV file] > [output CSV file]\n")
+if (len(sys.argv) != 2):
+    sys.stderr.write(
+        "Usage: python keep_most_recent_row_for_ech_product.pl [input CSV file] > [output CSV file]\n")
     sys.exit()
 
 input_csv_file = sys.argv[1]
@@ -16,4 +17,4 @@
 # In practice, it is sorted by fdc_id, which seems to be a sequence ordered by date
 df.drop_duplicates(subset=['code'], keep='last', inplace=True)
 
-df.to_csv(sys.stdout, sep='\t', quoting=None, index=False)
\ No newline at end of file
+df.to_csv(sys.stdout, sep='\t', quoting=None, index=False)
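For reference, the final hunk keeps only the most recent row per product: the input is assumed to arrive ordered by fdc_id, so drop_duplicates(keep='last') retains the newest entry for each code. A minimal sketch of that behaviour on a made-up in-memory sample (the sample rows and column values below are illustrative, not taken from the real USDA export):

import io

import pandas as pd

# Illustrative tab-separated sample: two rows share code 123; the later fdc_id should win.
sample = io.StringIO(
    "fdc_id\tcode\tname\n"
    "1\t123\told name\n"
    "2\t123\tnew name\n"
    "3\t456\tother product\n"
)
df = pd.read_csv(sample, sep='\t', dtype=str)

# Keep the most recent row for each product code (rows assumed sorted by fdc_id).
df.drop_duplicates(subset=['code'], keep='last', inplace=True)

print(df.to_csv(sep='\t', index=False))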