Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
lijaan committed Jun 2, 2024
1 parent fdb4e02 commit de3ace9
Showing 1 changed file with 147 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -91,7 +91,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -103,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -322,6 +322,7 @@
"│ ├─additional_identifiers.csv.gz\n",
"│ ├─non_reg_addresses.csv.gz\n",
"│ └─alternative_names.csv.gz\n",
"├─additional_identifiers.parquet\n",
"├─pr/\n",
"│ ├─companies.csv.gz\n",
"│ ├─officers.csv.gz\n",
Expand All @@ -336,10 +337,16 @@
"│ └─alternative_names.csv.gz\n",
"├─us_ny/\n",
"│ ├─.DS_Store\n",
"│ ├─officers.parquet\n",
"│ ├─companies.csv.gz\n",
"│ ├─officers.csv.gz\n",
"│ ├─companies.csv\n",
"│ ├─additional_identifiers.parquet\n",
"│ ├─non_reg_addresses.parquet\n",
"│ ├─additional_identifiers.csv.gz\n",
"│ ├─companies.parquet\n",
"│ ├─tmp_companies.parquet\n",
"│ ├─alternative_names.parquet\n",
"│ ├─non_reg_addresses.csv.gz\n",
"│ └─alternative_names.csv.gz\n",
"├─us_az/\n",
Expand Down Expand Up @@ -492,7 +499,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -502,7 +509,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -524,7 +531,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -534,7 +541,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -543,7 +550,7 @@
"'id,company_number,jurisdiction_code,name,title,first_name,last_name,position,start_date,person_number,person_uid,end_date,current_status,occupation,nationality,country_of_residence,partial_date_of_birth,type,address.in_full,address.street_address,address.locality,address.region,address.postal_code,address.country,retrieved_at,source_url\\r\\n72342016,3332175,us_ny,WAJAD AZIMI,,,,agent,,,,,,,,,,,\"363 S BROADWAY, YONKERS, NEW YORK, 10705\",,,,,,2016-04-22 16:58:01 UTC,\\r\\n224809571,3785280,us_ny,BENEDETTA AMADI,,,,chief executive officer,,,,,,,,,,,\"BENEDETTA AMADI, BROOKLYN, NY, 11211\",BENEDETTA AMADI,BROOKLYN,NY,11211,,2024-05-13 11:37:21 UTC,\\r\\n224940888,2067108,us_ny,RICHARD P. SZMYR,,,,chief executive officer,,,,,,,,,,,\"RICHARD P. SZMYR, SCOTIA, NY, 12302\",RICHARD P. SZMYR,SCOTIA,NY,12302,,2024-05-06 11:39:24 UTC,\\r\\n225116861,2116899,us_ny,RON BUKSHPAN,,,,chief executive officer,,,,,,,,,,,\"RON BUKSHPAN, CUMMING, GA, 30041\",RON BUKSHPAN,CUMMING,GA,30041,,2024-05-06 11:39:24 UTC,\\r\\n'"
]
},
"execution_count": 78,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -554,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -564,7 +571,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -574,7 +581,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -591,7 +598,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -628,20 +635,26 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total 7582728\n",
"total 8942064\n",
"-rwxr--r-- 1 me staff 159K May 22 11:37 \u001b[31madditional_identifiers.csv.gz\u001b[m\u001b[m*\n",
"-rw-r--r-- 1 me staff 139K May 26 10:19 additional_identifiers.parquet\n",
"-rwxr--r-- 1 me staff 482K May 22 11:37 \u001b[31malternative_names.csv.gz\u001b[m\u001b[m*\n",
"-rw-r--r-- 1 me staff 415K May 26 10:43 alternative_names.parquet\n",
"-rwxr--r--@ 1 me staff 2.9G May 22 12:55 \u001b[31mcompanies.csv\u001b[m\u001b[m*\n",
"-rwxr--r--@ 1 me staff 438M May 22 11:38 \u001b[31mcompanies.csv.gz\u001b[m\u001b[m*\n",
"-rw-r--r-- 1 me staff 407M May 26 10:51 companies.parquet\n",
"-rwxr--r-- 1 me staff 18M May 22 11:38 \u001b[31mnon_reg_addresses.csv.gz\u001b[m\u001b[m*\n",
"-rwxr--r-- 1 me staff 235M May 22 11:38 \u001b[31mofficers.csv.gz\u001b[m\u001b[m*\n"
"-rw-r--r-- 1 me staff 16M May 26 10:27 non_reg_addresses.parquet\n",
"-rwxr--r-- 1 me staff 235M May 22 11:38 \u001b[31mofficers.csv.gz\u001b[m\u001b[m*\n",
"-rw-r--r-- 1 me staff 240M May 26 10:33 officers.parquet\n",
"-rw-r--r-- 1 me staff 0B May 26 10:49 tmp_companies.parquet\n"
]
}
],
Expand All @@ -658,7 +671,7 @@
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand All @@ -670,7 +683,7 @@
"<IPython.core.display.HTML object>"
]
},
"execution_count": 98,
"execution_count": 16,
"metadata": {
"text/html": {
"jupyter_ai": {
Expand All @@ -696,7 +709,121 @@
},
{
"cell_type": "code",
"execution_count": 99,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"RuntimeError: (duckdb.duckdb.BinderException) Binder Error: Invalid named parameter \"decompress\" for function read_csv_auto\n",
"Candidates:\n",
" hive_types_autocast BOOLEAN\n",
" hive_types ANY\n",
" union_by_name BOOLEAN\n",
" filename BOOLEAN\n",
" dtypes ANY\n",
" null_padding BOOLEAN\n",
" parallel BOOLEAN\n",
" decimal_separator VARCHAR\n",
" buffer_size UBIGINT\n",
" all_varchar BOOLEAN\n",
" store_rejects BOOLEAN\n",
" names VARCHAR[]\n",
" compression VARCHAR\n",
" ignore_errors BOOLEAN\n",
" rejects_scan VARCHAR\n",
" quote VARCHAR\n",
" max_line_size VARCHAR\n",
" types ANY\n",
" skip BIGINT\n",
" column_types ANY\n",
" rejects_table VARCHAR\n",
" normalize_names BOOLEAN\n",
" nullstr ANY\n",
" auto_type_candidates ANY\n",
" sample_size BIGINT\n",
" auto_detect BOOLEAN\n",
" timestampformat VARCHAR\n",
" force_not_null VARCHAR[]\n",
" rejects_limit BIGINT\n",
" columns ANY\n",
" new_line VARCHAR\n",
" maximum_line_size VARCHAR\n",
" allow_quoted_nulls BOOLEAN\n",
" escape VARCHAR\n",
" header BOOLEAN\n",
" hive_partitioning BOOLEAN\n",
" sep VARCHAR\n",
" column_names VARCHAR[]\n",
" dateformat VARCHAR\n",
" delim VARCHAR\n",
"\n",
"LINE 7: FROM read_csv_auto('~/data/opencorporates/us_ny/additional_identifiers.csv.gz', header=True, decompress='gzip')\n",
")\n",
"TO '~/data/opencorporates/us_ny/additional_identifiers.parquet'\n",
"(FORMAT 'parquet', CODEC 'ZSTD');...\n",
" ^\n",
"[SQL: COPY (\n",
" SELECT\n",
" company_number,\n",
" jurisdiction_code,\n",
" uid,\n",
" identifier_system_code\n",
" FROM read_csv_auto('~/data/opencorporates/us_ny/additional_identifiers.csv.gz', header=True, decompress='gzip')\n",
")\n",
"TO '~/data/opencorporates/us_ny/additional_identifiers.parquet'\n",
"(FORMAT 'parquet', CODEC 'ZSTD');]\n",
"(Background on this error at: https://sqlalche.me/e/20/f405)\n",
"If you need help solving this issue, send us a message: https://ploomber.io/community\n"
]
}
],
"source": [
"%%sql\n",
"COPY (\n",
" SELECT \n",
" company_number,\n",
" jurisdiction_code,\n",
" uid,\n",
" identifier_system_code\n",
" -- gzip input: the read_csv_auto option is named compression (decompress is rejected by the binder)\n",
" FROM read_csv_auto('~/data/opencorporates/us_ny/additional_identifiers.csv.gz', header=True, compression='gzip')\n",
")\n",
"TO '~/data/opencorporates/us_ny/additional_identifiers.parquet'\n",
"(FORMAT 'parquet', CODEC 'ZSTD');"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{10: 'id,company_number,jurisdiction_code,name,title,first_name,last_name,position,start_date,person_number,person_uid,end_date,current_status,occupation,nationality,country_of_residence,partial_date_of_birth,type,address.in_full,address.street_address,address.locality,address.region,address.postal_code,address.country,retrieved_at,source_url\\r\\n72342016,3332175,us_ny,WAJAD AZIMI,,,,agent,,,,,,,,,,,\"363 S BROADWAY, YONKERS, NEW YORK, 10705\",,,,,,2016-04-22 16:58:01 UTC,\\r\\n224809571,3785280,us_ny,BENEDETTA AMADI,,,,chief executive officer,,,,,,,,,,,\"BENEDETTA AMADI, BROOKLYN, NY, 11211\",BENEDETTA AMADI,BROOKLYN,NY,11211,,2024-05-13 11:37:21 UTC,\\r\\n224940888,2067108,us_ny,RICHARD P. SZMYR,,,,chief executive officer,,,,,,,,,,,\"RICHARD P. SZMYR, SCOTIA, NY, 12302\",RICHARD P. SZMYR,SCOTIA,NY,12302,,2024-05-06 11:39:24 UTC,\\r\\n225116861,2116899,us_ny,RON BUKSHPAN,,,,chief executive officer,,,,,,,,,,,\"RON BUKSHPAN, CUMMING, GA, 30041\",RON BUKSHPAN,CUMMING,GA,30041,,2024-05-06 11:39:24 UTC,\\r\\n',\n",
" 16: <IPython.core.display.HTML object>,\n",
" 18: Empty DataFrame\n",
" Columns: [Success]\n",
" Index: [],\n",
" 19: Empty DataFrame\n",
" Columns: [Success]\n",
" Index: [],\n",
" 23: <IPython.core.display.HTML object>}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Out"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -734,7 +861,7 @@
"Index: []"
]
},
"execution_count": 99,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand Down

0 comments on commit de3ace9

Please sign in to comment.