Skip to content

Commit 955cecc

Browse files
authored
feat(GDPR): improve script to manage Picard (#430)
1 parent ed09039 commit 955cecc

File tree

5 files changed

+178
-55
lines changed

5 files changed

+178
-55
lines changed

scripts/gdpr/README.md

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,26 @@
1-
# GDPR request data
1+
# Uploading GDPR price data
22

33
## Context
44

5-
One of our data sources is GDPR request to supermarkets. See https://wiki.openfoodfacts.org/GDPR_request
5+
One of our data sources is GDPR requests sent to supermarkets (for customers holding a loyalty card).
6+
7+
See https://wiki.openfoodfacts.org/GDPR_request
68

79
## List of supermarkets
810

9-
|Supermarket|Data|Preprocessing|
10-
|-----------|---|---|
11-
|Auchan |1 single file||
11+
|Supermarket|Data |Preprocessing|
12+
|-----------|------------------|---|
13+
|Auchan |1 single file ||
1214
|Carrefour |1 file with 2 tabs|- merge files<br/>- skip discounts|
13-
|E.Leclerc |2 files|- merge files|
14-
|Intermarché|1 single file||
15+
|E.Leclerc |2 files |- merge files|
16+
|Intermarché|1 single file ||
17+
|Picard |1 file with multiple tables|- create separate files<br/>- merge files|
1518

1619
## Usage
1720

18-
### Step 1: get an API token
21+
### Step 1: get your API token from Open Prices
1922

20-
https://prices.openfoodfacts.org/api/docs#/Auth/authentication_api_v1_auth_post
23+
https://prices.openfoodfacts.org/api/docs#/auth/auth_create
2124

2225
### Step 2: upload a proof
2326

@@ -42,7 +45,7 @@ Depending on the source, you'll need to provide the correct `LOCATION` key, and
4245
Use the token returned in Step 1.
4346

4447
```
45-
FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python data/gdpr/create_prices_from_gdpr_csv.py
48+
FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
4649
```
4750

4851
Last changes when you're ready:
@@ -55,12 +58,25 @@ Last changes when you're ready:
5558

5659
Script name: `merge_two_csv_files.py`
5760

61+
Goal: merge and enrich data from the second csv file into the first csv file.
62+
63+
#### E.Leclerc
64+
5865
E.Leclerc returns 2 different files, one containing a list of receipts (with dates & locations), and the other a list of products with their receipt id. So we need to first merge the 2 files into 1.
5966
```
6067
(TODO)
6168
```
6269

70+
#### Carrefour
71+
6372
For Carrefour, the file contains 2 tabs, 1 called "Tickets" and the other called "Remise".
6473
```
65-
FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python data/gdpr/merge_two_csv_files.py
74+
FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME_LIST="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python scripts/gdpr/merge_two_csv_files.py
75+
```
76+
77+
#### Picard
78+
79+
Picard returns 1 spreadsheet containing multiple tables. We first need to store the Product table & the Tickets table in 2 separate csv files.
80+
```
81+
FILEPATH_1=Picard_Produits.csv FILEPATH_2=Picard_Tickets.csv PIVOT_FIELD_NAME_LIST="NUMERO DE TICKET" EXCLUDE_FIELD_NAME_LIST="PRIX TTC" poetry run python scripts/gdpr/merge_two_csv_files.py
6682
```

scripts/gdpr/create_prices_from_gdpr_csv.py

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
import time
66

77
import requests
8+
from utils import get_picard_product_from_subcode
89

910
OPEN_PRICES_CREATE_PRICE_ENDPOINT = f'{os.environ.get("API_ENDPOINT")}/prices'
1011
OPEN_PRICES_TOKEN = os.environ.get("API_TOKEN")
11-
GDPR_FIELD_MAPPING_FILEPATH = "data/gdpr/gdpr_field_mapping.csv"
12+
13+
GDPR_FIELD_MAPPING_FILEPATH = "scripts/gdpr/gdpr_field_mapping.csv"
1214

1315
DEFAULT_PRICE_CURRENCY = "EUR"
1416
PRICE_FIELDS = [
@@ -44,10 +46,14 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
4446
# remove any whitespace
4547
gdpr_field_value = gdpr_field_value.strip()
4648

47-
# shop specific rules
48-
if gdpr_source == "AUCHAN":
49-
if op_field == "price":
49+
# field-specific rules
50+
if op_field in ["price", "quantity"]:
51+
if gdpr_field_value:
5052
gdpr_field_value = float(gdpr_field_value.replace(",", "."))
53+
54+
# shop-specific rules
55+
if gdpr_source == "AUCHAN":
56+
pass
5157
elif gdpr_source == "CARREFOUR":
5258
# input: |3178050000749|
5359
# output: 3178050000749
@@ -62,15 +68,18 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
6268
elif gdpr_source == "ELECLERC":
6369
pass
6470
elif gdpr_source == "INTERMARCHE":
65-
if op_field in ["price", "quantity"]:
66-
# divide price by quantity
67-
gdpr_field_value = float(gdpr_field_value.replace(",", "."))
6871
# input: 27/05/2021
6972
# output: 2021-05-27
7073
if op_field == "date":
7174
gdpr_field_value = datetime.datetime.strptime(
7275
gdpr_field_value, "%d/%m/%Y"
7376
).strftime("%Y-%m-%d")
77+
elif gdpr_source == "PICARD":
78+
# Picard codes are a subset of the EAN codes
79+
# They have a length of 5 (4 if missing leading 0)
80+
if op_field == "product_code":
81+
if len(gdpr_field_value) == 4:
82+
gdpr_field_value = f"0{gdpr_field_value}"
7483

7584
return gdpr_field_value
7685

@@ -79,15 +88,15 @@ def gdpr_source_price_cleanup_rules(gdpr_source, gdpr_op_price):
7988
"""
8089
Rules to cleanup the price object
8190
"""
82-
if gdpr_source == "AUCHAN":
83-
pass
84-
elif gdpr_source == "CARREFOUR":
85-
pass
86-
elif gdpr_source == "ELECLERC":
87-
pass
88-
elif gdpr_source == "INTERMARCHE":
89-
# price must be divided by quantity
90-
gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]
91+
# price must be divided by quantity
92+
if "quantity" in gdpr_op_price:
93+
if gdpr_op_price["quantity"]:
94+
gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]
95+
96+
# discount boolean flag
97+
if "discount" in gdpr_op_price:
98+
if gdpr_op_price["discount"]:
99+
gdpr_op_price["price_is_discounted"] = True
91100

92101
return gdpr_op_price
93102

@@ -135,6 +144,12 @@ def gdpr_source_filter_rules(op_price_list, gdpr_source=""):
135144
passes_test = False
136145
elif gdpr_source == "INTERMARCHE":
137146
pass
147+
elif gdpr_source == "PICARD":
148+
# get_picard_product_from_subcode returns a (passes_test, full_product_code)
# tuple. It must be unpacked: a non-empty tuple is always truthy, so using
# the raw return value would store the tuple itself as the product code
# and never filter out skipped products.
_, full_product_code = get_picard_product_from_subcode(op_price)
if full_product_code:
    op_price["product_code"] = full_product_code
else:
    # no full code could be resolved (nothing found, or skipped by the user)
    passes_test = False
138153

139154
if passes_test:
140155
op_price_list_filtered.append(op_price)
@@ -219,7 +234,7 @@ def create_price(price):
219234
if __name__ == "__main__":
220235
"""
221236
How-to run:
222-
> FILEPATH= poetry run python data/gdpr/create_prices_from_gdpr_csv.py
237+
> FILEPATH= poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
223238
Required params: see REQUIRED_ENV_PARAMS
224239
"""
225240
# Step 1: read input file
@@ -256,21 +271,21 @@ def create_price(price):
256271
)
257272
print(len(open_prices_price_list))
258273

259-
# Step 4a: filter prices depending on specific source rules
260-
print("===== Applying source filtering rules")
261-
open_prices_price_list_filtered_1 = gdpr_source_filter_rules(
262-
open_prices_price_list, gdpr_source=source
274+
# Step 4a: filter prices depending on location
275+
print("===== Applying location filtering rules")
276+
open_prices_price_list_filtered_1 = gdpr_source_location_rules(
277+
open_prices_price_list
263278
)
264279
print(len(open_prices_price_list_filtered_1))
265280

266-
# Step 4b: filter prices depending on location
267-
print("===== Applying location filtering rules")
268-
open_prices_price_list_filtered_2 = gdpr_source_location_rules(
269-
open_prices_price_list_filtered_1
281+
# Step 4b: filter prices depending on specific source rules
282+
print("===== Applying source filtering rules")
283+
open_prices_price_list_filtered_2 = gdpr_source_filter_rules(
284+
open_prices_price_list_filtered_1, gdpr_source=source
270285
)
271286
print(len(open_prices_price_list_filtered_2))
272287

273-
print("===== Output example (extra fields will be ignored):")
288+
print("===== Output example (extra fields will be ignored)")
274289
print(open_prices_price_list_filtered_2[0])
275290

276291
# Step 5: send prices to backend via API
Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT
2-
product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD
3-
product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0
4-
price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points
5-
discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,
6-
quantity,,,Quantité,,,,Qte Vendues,
7-
date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY
8-
location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,
1+
OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT,PICARD_FIELD,PICARD_COMMENT
2+
product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD,CODE PRODUIT,a 5-number code. need to do an extra API search to find the corresponding product
3+
product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0,LIBELLE ARTICLE,
4+
price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points,PRIX TTC,has commas instead of points
5+
discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,,IDENTIFIANT REMISE,a string ID to another table
6+
quantity,,,Quantité,,,,Qte Vendues,,NOMBRE UNITES,
7+
date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY,DATE TICKET,
8+
location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,,NOM DU MAGASIN,

scripts/gdpr/merge_two_csv_files.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,37 @@ def read_csv(filepath):
1313
return data
1414

1515

16-
def merge_data_of_two_lists(
    list_1, list_2, pivot_field_name_list=("ticket",), exclude_field_name_list=()
):
    """
    Enrich every row of list_1 with the fields of its matching row in list_2.

    Two rows match when they hold equal values for every pivot field.
    If several rows of list_2 match, the last one wins. If none matches,
    the list_2 columns are still added, with empty strings as values.

    Params:
    - list_1: list of dicts (the rows to enrich). One output row per input row.
    - list_2: list of dicts (the rows to look up). Its columns are read from
      its first row; an empty list adds no column. Rows are not modified.
    - pivot_field_name_list: field names used to match rows. They must exist
      in the rows of both lists, and their values must be hashable.
    - exclude_field_name_list: list_2 field names to leave out of the merge,
      so that they do not override the list_1 field of the same name.

    Returns: a new list of merged dicts (list_2 values override list_1 values
    for fields present in both).
    """
    pivot_field_name_list = list(pivot_field_name_list)
    exclude_field_name_set = set(exclude_field_name_list)
    list_2_field_name_list = list(list_2[0].keys()) if list_2 else []

    # index list_2 by pivot values for a direct lookup
    # later rows overwrite earlier ones: the last matching row wins
    list_2_row_by_pivot = {
        tuple(row[field_name] for field_name in pivot_field_name_list): row
        for row in list_2
    }

    data_merged = []
    for row_1 in list_1:
        # find corresponding row in list_2
        pivot_value_list = tuple(
            row_1[field_name] for field_name in pivot_field_name_list
        )
        row_2 = list_2_row_by_pivot.get(pivot_value_list)
        if not row_2:
            # no match: keep the pivot values, leave the other columns empty
            row_2 = {
                key: (row_1[key] if key in pivot_field_name_list else "")
                for key in list_2_field_name_list
            }
        # cleanup row_2 (on a copy, so that list_2 is left untouched)
        row_2 = {
            key: value
            for key, value in row_2.items()
            if key not in exclude_field_name_set
        }
        # merge
        data_merged.append({**row_1, **row_2})

    return data_merged
@@ -44,12 +61,14 @@ def write_csv(data, filepath):
4461
if __name__ == "__main__":
4562
"""
4663
How-to run:
47-
> FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME= poetry run python data/gdpr/merge_two_csv_files.py # noqa
64+
> FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME_LIST= EXCLUDE_FIELD_NAME_LIST= poetry run python scripts/gdpr/merge_two_csv_files.py # noqa
4865
"""
4966
filepath_1 = os.environ.get("FILEPATH_1")
5067
filepath_2 = os.environ.get("FILEPATH_2")
51-
pivot_field_name = os.environ.get("PIVOT_FIELD_NAME")
52-
pivot_field_name_list = pivot_field_name.split(",")
68+
pivot_field_name_str = os.environ.get("PIVOT_FIELD_NAME_LIST")
69+
pivot_field_name_list = pivot_field_name_str.split(",")
70+
# EXCLUDE_FIELD_NAME_LIST is optional: default to an empty string so that
# .split() is never called on None, and drop empty names
exclude_field_name_str = os.environ.get("EXCLUDE_FIELD_NAME_LIST", "")
exclude_field_name_list = [
    field_name for field_name in exclude_field_name_str.split(",") if field_name
]
5372
output_filepath = filepath_1.split(".csv")[0] + "_merged.csv"
5473

5574
print(f"Step 1: reading {filepath_1}")
@@ -60,9 +79,14 @@ def write_csv(data, filepath):
6079
data_2 = read_csv(filepath_2)
6180
print(f"{len(data_2)} lines")
6281

63-
print(f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list}")
82+
print(
83+
f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list} (and excluding: {exclude_field_name_list})"
84+
)
6485
data_merged = merge_data_of_two_lists(
65-
data_1, data_2, pivot_list=pivot_field_name_list
86+
data_1,
87+
data_2,
88+
pivot_field_name_list=pivot_field_name_list,
89+
exclude_field_name_list=exclude_field_name_list,
6690
)
6791
print(f"{len(data_merged)} lines")
6892

scripts/gdpr/utils.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import requests
2+
3+
OFF_SEARCHLICIOUS_API_ENDPOINT = "https://search.openfoodfacts.org/search"
4+
PICARD_GS1_PREFIX = "327016"
5+
6+
7+
def get_picard_product_from_subcode(op_price_dict):
8+
# the Picard product_code is incomplete
9+
# use Search-a-licious API to get the full product code
10+
# if needed, prompt the user to select the correct one
11+
passes_test = True
12+
full_product_code = None
13+
14+
print(
15+
"----- Input:",
16+
op_price_dict["product_code"],
17+
op_price_dict["product_name"],
18+
op_price_dict["price"],
19+
)
20+
for q_index, q_params in enumerate(
21+
[
22+
f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}? brands:picard",
23+
f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}?",
24+
f"code:*{op_price_dict['product_code']}? brands:picard",
25+
f"code:*{op_price_dict['product_code']}?&page_size=50",
26+
]
27+
):
28+
response = requests.get(
29+
OFF_SEARCHLICIOUS_API_ENDPOINT,
30+
params={"q": q_params},
31+
)
32+
print(response.url)
33+
if response.status_code == 200:
34+
response_product_count = response.json()["count"]
35+
print("Products found:", response_product_count)
36+
if response_product_count:
37+
# confidence strong enough: take the first product
38+
if (q_index < 2) and (response_product_count == 1):
39+
full_product_code = response.json()["hits"][0]["code"]
40+
else:
41+
# multiple results: prompt the user to select
42+
response_product_list = response.json()["hits"]
43+
for index, response_product in enumerate(response_product_list):
44+
print(
45+
index + 1,
46+
":",
47+
response_product.get("code"),
48+
response_product.get("product_name", ""),
49+
response_product.get("brands_tags", ""),
50+
response_product.get("stores", ""),
51+
)
52+
user_choice_number_str = input(
53+
"Which product ? Type 0 to skip. Or provide the correct code. "
54+
)
55+
if len(user_choice_number_str) == 1:
56+
full_product_code = response_product_list[
57+
int(user_choice_number_str) - 1
58+
]["code"]
59+
print("Chosen product code:", full_product_code)
60+
elif 3 < len(user_choice_number_str) <= 13:
61+
full_product_code = user_choice_number_str
62+
print("Chosen product code:", full_product_code)
63+
else:
64+
print("Product not found...")
65+
passes_test = False
66+
break
67+
68+
return passes_test, full_product_code

0 commit comments

Comments
 (0)