Skip to content

Commit b9c92f9

Browse files
committed
Adding process to export data from bigquery with signed URL.
1 parent 6c0a09b commit b9c92f9

File tree

2 files changed

+269
-0
lines changed

2 files changed

+269
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
How to run it
2+
----
3+
4+
Define a file called .env with the following content:
5+
6+
```
7+
CAROLAPPNAME=app_name
8+
CAROLTENANT=tenant_name
9+
CAROLORGANIZATION=org_name
10+
CAROLAPPOAUTH=api_key_authorization
11+
CAROLCONNECTORID=api_key_connector_id
12+
```
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 149,
6+
"id": "7b62a9d3-13fa-4e73-91c2-6d44f6319c95",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"# Import necessary libraries\n",
11+
"import pandas as pd\n",
12+
"from datetime import date\n",
13+
"import datetime\n",
14+
"\n",
15+
"from google.cloud import bigquery\n",
16+
"from google.oauth2 import service_account\n",
17+
"from google.cloud import storage\n",
18+
"\n",
19+
"from dotenv import load_dotenv\n",
20+
"from pycarol.apps import Apps\n",
21+
"from pycarol.bigquery import TokenManager\n",
22+
"from pycarol import Carol"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 190,
28+
"id": "78f267f0-67f9-43a2-aa84-634a5d798e9f",
29+
"metadata": {},
30+
"outputs": [],
31+
"source": [
def generate_download_signed_url_v4(bucket_name, blob_name, credentials, expiration_minutes=15):
    """Generate a v4 signed URL for downloading a blob with an HTTP GET.

    Note that this method requires service account credentials (a key). You
    can not use this with Application Default Credentials from Google Compute
    Engine or from the Google Cloud SDK.

    Args:
        bucket_name: Name of the bucket that holds the object,
            e.g. 'your-bucket-name'.
        blob_name: Name (path) of the object inside the bucket,
            e.g. 'your-object-name'.
        credentials: Credentials handed to ``storage.Client``.
        expiration_minutes: How long the URL stays valid, in minutes.
            Defaults to 15, the value that used to be hard-coded.

    Returns:
        The value returned by ``blob.generate_signed_url`` (the signed URL).
    """
    storage_client = storage.Client(credentials=credentials)
    blob = storage_client.bucket(bucket_name).blob(blob_name)

    url = blob.generate_signed_url(
        version="v4",
        # Lifetime of the URL, counted from the moment it is signed.
        expiration=datetime.timedelta(minutes=expiration_minutes),
        # Allow GET requests using this URL.
        method="GET",
    )

    # NOTE(review): anyone holding this URL can download the object until it
    # expires, and these prints end up in the saved notebook output. Clear the
    # cell output before committing or sharing the notebook.
    print("Generated GET signed URL:")
    print(url)
    print("You can use this URL with any user agent, for example:")
    print(f"curl '{url}'")
    return url
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": 191,
64+
"id": "f4be0e22-bf0b-4036-bfd4-9f7610271b6b",
65+
"metadata": {},
66+
"outputs": [
67+
{
68+
"data": {
69+
"text/plain": [
70+
"True"
71+
]
72+
},
73+
"execution_count": 191,
74+
"metadata": {},
75+
"output_type": "execute_result"
76+
}
77+
],
78+
"source": [
# Read the variables defined in the local .env file before the Carol client is created.
load_dotenv(dotenv_path=".env")
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": 192,
85+
"id": "12156bf1-0a68-4161-ba97-e77f62fcfdff",
86+
"metadata": {},
87+
"outputs": [],
88+
"source": [
# Carol() is built without arguments; presumably it reads the CAROL* settings
# loaded from .env -- confirm against the pycarol documentation.
carol = Carol()

# NOTE(review): the meaning of the positional `None` and `False` arguments is
# not visible in this notebook -- check the TokenManager signature in pycarol.
tokenManager = TokenManager(carol, None, False)
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": 193,
96+
"id": "73b92bf3-1b42-49e9-9eef-a0e02a77cd99",
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
# Fetch a token from the manager and pull out its service-account section.
tokenGcp = tokenManager.get_token()
service_account_info = tokenGcp.to_dict()['service_account']

# Build Google credentials from that section and use them for the BigQuery client.
credentials = service_account.Credentials.from_service_account_info(service_account_info)
client = bigquery.Client(credentials=credentials)
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": 194,
108+
"id": "20cbd8b7-fa44-488d-9c32-3859f8753ab5",
109+
"metadata": {},
110+
"outputs": [],
111+
"source": [
# Environment id: used further down as the dataset part of the table reference.
env_section = tokenGcp.to_dict()['env']
envId = env_section['env_id']

# Project id: used further down as both the BigQuery project and the bucket name.
service_account_section = tokenGcp.to_dict()['service_account']
projectId = service_account_section['project_id']
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": 195,
119+
"id": "6352ded5-4bfe-4228-a307-962e7b995125",
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
# Export settings: Parquet output compressed with Snappy.
job_config = bigquery.ExtractJobConfig(
    compression=bigquery.Compression.SNAPPY,
    destination_format=bigquery.DestinationFormat.PARQUET,
)
126+
]
127+
},
128+
{
129+
"cell_type": "code",
130+
"execution_count": 196,
131+
"id": "74618814-275d-44c3-9341-ccf332c3923c",
132+
"metadata": {},
133+
"outputs": [
134+
{
135+
"name": "stdout",
136+
"output_type": "stream",
137+
"text": [
138+
"gs://carol-00b66d7bb91a4e43ae8e/user_space/export-ingestion_mdbusinesspartner-2024-02-26.parquet\n"
139+
]
140+
}
141+
],
142+
"source": [
# Table to export and the ISO date stamp that goes into the exported file name.
table_id = 'ingestion_mdbusinesspartner'
dateNow = date.today().isoformat()

# The file is written under user_space/ in the bucket named after the project id.
export_file_name = "".join(("export-", table_id, "-", dateNow, ".parquet"))
destination_uri = "gs://{}/user_space/{}".format(projectId, export_file_name)
print(destination_uri)

# Source table reference in project.dataset.table form (the dataset is the env id).
table_ref = "{}.{}.{}".format(projectId, envId, table_id)
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": 197,
155+
"id": "be695092-514b-4f22-828b-06c6ac4a0604",
156+
"metadata": {},
157+
"outputs": [
158+
{
159+
"name": "stdout",
160+
"output_type": "stream",
161+
"text": [
162+
"ExtractJob<project=carol-00b66d7bb91a4e43ae8e, location=US, id=413d0bb6-659e-4440-a15d-20da9746267b>\n",
163+
"Exported carol-00b66d7bb91a4e43ae8e:00b66d7bb91a4e43ae8e17649fb1a8fb.carol-00b66d7bb91a4e43ae8e.00b66d7bb91a4e43ae8e17649fb1a8fb.ingestion_mdbusinesspartner to gs://carol-00b66d7bb91a4e43ae8e/user_space/export-ingestion_mdbusinesspartner-2024-02-26.parquet\n"
164+
]
165+
}
166+
],
167+
"source": [
# Start the extract job that copies the source table to the Cloud Storage URI.
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config,
    # Location must match that of the source table.
    location="US",
)  # API request
a = extract_job.result()  # Waits for job to complete.
print(a)

# table_ref already has the "project.dataset.table" form, so formatting it after
# projectId and envId printed the project and dataset twice. Use the bare
# table_id so the message reads project:dataset.table.
print("Exported {}:{}.{} to {}".format(projectId, envId, table_id, destination_uri))
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 198,
183+
"id": "73d95f88-e5d9-40cc-898e-663e88bf59ef",
184+
"metadata": {},
185+
"outputs": [
186+
{
187+
"name": "stdout",
188+
"output_type": "stream",
189+
"text": [
190+
"Generated GET signed URL:\n",
191+
"https://storage.googleapis.com/carol-00b66d7bb91a4e43ae8e/user_space/export-ingestion_mdbusinesspartner-2024-02-26.parquet?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=tmpaccess-20240227t071214105z%40carol-00b66d7bb91a4e43ae8e.iam.gserviceaccount.com%2F20240227%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240227T032204Z&X-Goog-Expires=900&X-Goog-SignedHeaders=host&X-Goog-Signature=ce87f7319392267a18bb543a3c6004c15d15e0c7d3d28bf4fa3b57956814f44c7bb04447ef466e978ec68fc23ebd50ed9e6271e855f608c6f8498195fffc99db39a40d3fe6154a369f653e85ae478a22eb2002820c8a4bb70de1bd4d82840f9b7cbc79afca90fd8edb3a0080b342457d773cd2f266823051f8537eb05d3e76a83767aa11c73a164bebb7af0c2f9f58e1e175c3a449ecd9cedc17772c554489ec737b63f8f084e698ec8029ab672cbb65d26e119730ee67359084d666005a791dc874d8da594f744eda385e595f7a74b10dea82c9cec5f3483f40e10b28edcc746dbceb8c9ac0175d8b6df104753710f7ded3ff66c45a6e2a65885634fa024408\n",
192+
"You can use this URL with any user agent, for example:\n",
193+
"curl 'https://storage.googleapis.com/carol-00b66d7bb91a4e43ae8e/user_space/export-ingestion_mdbusinesspartner-2024-02-26.parquet?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=tmpaccess-20240227t071214105z%40carol-00b66d7bb91a4e43ae8e.iam.gserviceaccount.com%2F20240227%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240227T032204Z&X-Goog-Expires=900&X-Goog-SignedHeaders=host&X-Goog-Signature=ce87f7319392267a18bb543a3c6004c15d15e0c7d3d28bf4fa3b57956814f44c7bb04447ef466e978ec68fc23ebd50ed9e6271e855f608c6f8498195fffc99db39a40d3fe6154a369f653e85ae478a22eb2002820c8a4bb70de1bd4d82840f9b7cbc79afca90fd8edb3a0080b342457d773cd2f266823051f8537eb05d3e76a83767aa11c73a164bebb7af0c2f9f58e1e175c3a449ecd9cedc17772c554489ec737b63f8f084e698ec8029ab672cbb65d26e119730ee67359084d666005a791dc874d8da594f744eda385e595f7a74b10dea82c9cec5f3483f40e10b28edcc746dbceb8c9ac0175d8b6df104753710f7ded3ff66c45a6e2a65885634fa024408'\n"
194+
]
195+
},
196+
{
197+
"data": {
198+
"text/plain": [
199+
"'https://storage.googleapis.com/carol-00b66d7bb91a4e43ae8e/user_space/export-ingestion_mdbusinesspartner-2024-02-26.parquet?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=tmpaccess-20240227t071214105z%40carol-00b66d7bb91a4e43ae8e.iam.gserviceaccount.com%2F20240227%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240227T032204Z&X-Goog-Expires=900&X-Goog-SignedHeaders=host&X-Goog-Signature=ce87f7319392267a18bb543a3c6004c15d15e0c7d3d28bf4fa3b57956814f44c7bb04447ef466e978ec68fc23ebd50ed9e6271e855f608c6f8498195fffc99db39a40d3fe6154a369f653e85ae478a22eb2002820c8a4bb70de1bd4d82840f9b7cbc79afca90fd8edb3a0080b342457d773cd2f266823051f8537eb05d3e76a83767aa11c73a164bebb7af0c2f9f58e1e175c3a449ecd9cedc17772c554489ec737b63f8f084e698ec8029ab672cbb65d26e119730ee67359084d666005a791dc874d8da594f744eda385e595f7a74b10dea82c9cec5f3483f40e10b28edcc746dbceb8c9ac0175d8b6df104753710f7ded3ff66c45a6e2a65885634fa024408'"
200+
]
201+
},
202+
"execution_count": 198,
203+
"metadata": {},
204+
"output_type": "execute_result"
205+
}
206+
],
207+
"source": [
# Object path of the exported file inside the bucket.
exported_blob_name = "user_space/export-{}-{}.parquet".format(table_id, dateNow)

# The bucket is named after the project id; the returned URL is shown as the cell result.
generate_download_signed_url_v4(projectId, exported_blob_name, credentials)
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": null,
214+
"id": "182f591f-5968-4407-9eba-e4620452f719",
215+
"metadata": {},
216+
"outputs": [],
217+
"source": []
218+
},
219+
{
220+
"cell_type": "code",
221+
"execution_count": null,
222+
"id": "b86dc402-dcfd-4145-bbc5-4af0c7270e1a",
223+
"metadata": {},
224+
"outputs": [],
225+
"source": []
226+
},
227+
{
228+
"cell_type": "code",
229+
"execution_count": null,
230+
"id": "7e8f1548-5475-46d6-86f3-d957d9fed89b",
231+
"metadata": {},
232+
"outputs": [],
233+
"source": []
234+
}
235+
],
236+
"metadata": {
237+
"kernelspec": {
238+
"display_name": "Python 3 (ipykernel)",
239+
"language": "python",
240+
"name": "python3"
241+
},
242+
"language_info": {
243+
"codemirror_mode": {
244+
"name": "ipython",
245+
"version": 3
246+
},
247+
"file_extension": ".py",
248+
"mimetype": "text/x-python",
249+
"name": "python",
250+
"nbconvert_exporter": "python",
251+
"pygments_lexer": "ipython3",
252+
"version": "3.9.6"
253+
}
254+
},
255+
"nbformat": 4,
256+
"nbformat_minor": 5
257+
}

0 commit comments

Comments
 (0)