diff --git a/notebooks/load_and_visualize_opencorporates_corporate_entity_data_llc_corp.ipynb b/notebooks/load_and_visualize_opencorporates_corporate_entity_data_llc_corp.ipynb new file mode 100644 index 0000000..e00eca9 --- /dev/null +++ b/notebooks/load_and_visualize_opencorporates_corporate_entity_data_llc_corp.ipynb @@ -0,0 +1,1674 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction & Dependencies\n", + "\n", + "https://opencorporates.com/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext jupyter_ai" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "%dotenv " + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Deploy Dash apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/me/projects/new-york-real-estate/.venv/lib/python3.10/site-packages/sql/traits.py:20: FutureWarning: named_parameters: boolean values are now deprecated. Value True will be treated as \"enabled\". \n", + "Please use a valid option: \"warn\", \"enabled\", or \"disabled\". 
\n", + "For more information, see the docs: https://jupysql.ploomber.io/en/latest/api/configuration.html#named-parameters\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Load duckdb, which lets us efficiently load large files\n", + "import duckdb\n", + "\n", + "# Load pandas, which lets us manipulate dataframes\n", + "import pandas as pd\n", + "\n", + "# Import jupysql Jupyter extension to create SQL cells\n", + "%load_ext sql\n", + "\n", + "# Set configurations on jupysql to directly output data to Pandas and to simplify the output that is printed to the notebook.\n", + "%config SqlMagic.autopandas = True\n", + "\n", + "%config SqlMagic.feedback = False\n", + "%config SqlMagic.displaycon = False\n", + "\n", + "# Allow named parameters (python variables) in SQL cells; boolean values are deprecated, so use the string option\n", + "%config SqlMagic.named_parameters=\"enabled\"\n", + "\n", + "# Connect jupysql to DuckDB using a SQLAlchemy-style connection string. Either connect to an in memory DuckDB, or a file backed db.\n", + "%sql duckdb:///:memory:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test on one state: gather context for language model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture path_structure\n", + "import seedir as sd\n", + "path = '~/data/opencorporates'\n", + "sd.seedir(path, style='lines', depthlimit=2, exclude_folders=['.git', '.ipynb_checkpoints'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "opencorporates/\n", + "├─us_mi/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_mn/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", +
"├─us_nh/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_al/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ky/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ak/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_mt/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ms/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ga/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ma/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_mo/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ut/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_sc/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ 
└─alternative_names.csv.gz\n", + "├─.DS_Store\n", + "├─us_sd/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_vt/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_va/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_ns/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_tx/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_bc/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_wy/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_nu/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ri/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_wv/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", 
+ "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ca/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_co/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_pe/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_fl/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ct/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ia/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ok/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_in/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_qc/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ne/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ks/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ 
├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─pr/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_md/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ny/\n", + "│ ├─.DS_Store\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─companies.csv\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_az/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_de/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_hi/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_me/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_nj/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_nm/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_nd/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + 
"├─us_nc/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_dc/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_nv/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_ar/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_pa/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_on/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_nl/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─ca_nb/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_tn/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_wa/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─date.txt\n", + "├─us_wi/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ 
├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_il/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_or/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─md5sum.txt\n", + "├─us_la/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "├─us_id/\n", + "│ ├─companies.csv.gz\n", + "│ ├─officers.csv.gz\n", + "│ ├─additional_identifiers.csv.gz\n", + "│ ├─non_reg_addresses.csv.gz\n", + "│ └─alternative_names.csv.gz\n", + "└─us_oh/\n", + " ├─companies.csv.gz\n", + " ├─officers.csv.gz\n", + " ├─additional_identifiers.csv.gz\n", + " ├─non_reg_addresses.csv.gz\n", + " └─alternative_names.csv.gz\n", + "\n" + ] + } + ], + "source": [ + "print(path_structure.stdout)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture ny_companies\n", + "!gzcat ~/data/opencorporates/us_ny/companies.csv.gz | head -n 5" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"company_number,jurisdiction_code,name,normalised_name,company_type,nonprofit,current_status,incorporation_date,dissolution_date,branch,business_number,current_alternative_legal_name,current_alternative_legal_name_language,home_jurisdiction_text,native_company_number,previous_names,retrieved_at,registry_url,restricted_for_marketing,inactive,accounts_next_due,accounts_reference_date,accounts_last_made_up_date,annual_return_next_due,annual_return_last_made_up_date,has_been_liquidated,has_insolvency_history,has_charges,number_of_employees,registered_address.street_address,registered_address.locality,registered_address.region,registered_address.postal_code,registered_address.country,registered_address.in_full,home_jurisdiction_code,home_jurisdiction_company_number,industry_code_uids,latest_accounts_date,latest_accounts_cash,latest_accounts_assets,latest_accounts_liabilities\n", + "1000000,us_ny,TAYLOR RAND LTD.,taylor rand limited,DOMESTIC BUSINESS CORPORATION,false,Inactive Dissolution By Proclamation / Annulment Of Authority,1985-05-28,1994-03-23,,,,,,,TAYLOR RAND LTD.,2016-07-05 07:01:09 UTC,https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?p_nameid=0&p_corpid=1000000&p_entity_name=%25&p_name_type=%25&p_search_type=BEGINS&p_srch_results_page=0,,true,,,,,,,,,,\"CAMPOS & PAVLIDES PC, 186-09 UNION TPKE, FLUSHING, NEW YORK, 11366\",,,,United States,\"CAMPOS & PAVLIDES PC, 186-09 UNION TPKE, FLUSHING, NEW YORK, 11366, United States\",,,,,,,\n", + "1000001,us_ny,\"CRYSTAL BEACH ENTERPRISES, INC.\",crystal beach enterprises incorporated,DOMESTIC BUSINESS CORPORATION,false,Inactive Dissolution By Proclamation / Annulment Of Authority,1985-05-28,1992-06-24,,,,,NEW YORK,,\"CRYSTAL BEACH ENTERPRISES, INC.\",2016-07-05 07:01:09 UTC,https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?p_nameid=0&p_corpid=1000001&p_entity_name=%25&p_name_type=%25&p_search_type=BEGINS&p_srch_results_page=0,,true,,,,,,,,,,\"%JOHN GIARDINO, 181 FRANKLIN ST, 
BUFFALO, NEW YORK, 14202\",,,,United States,\"%JOHN GIARDINO, 181 FRANKLIN ST, BUFFALO, NEW YORK, 14202, United States\",,,,,,,\n", + "1000002,us_ny,\"PAUL'S GIRL, INC.\",pauls girl incorporated,DOMESTIC BUSINESS CORPORATION,false,Inactive Dissolution By Proclamation / Annulment Of Authority,1985-05-28,1993-03-24,,,,,NEW YORK,,\"PAUL'S GIRL, INC.\",2016-07-05 07:01:11 UTC,https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?p_nameid=0&p_corpid=1000002&p_entity_name=%25&p_name_type=%25&p_search_type=BEGINS&p_srch_results_page=0,,true,,,,,,,,,,\"ARTHUR GINS, 501 SEVENTH AVE, NEW YORK, NEW YORK, 10018\",,,,United States,\"ARTHUR GINS, 501 SEVENTH AVE, NEW YORK, NEW YORK, 10018, United States\",,,,,,,\n", + "1000003,us_ny,\"NORMAN'S TEEN TOURS, INC.\",normans teen tours incorporated,DOMESTIC BUSINESS CORPORATION,false,Inactive Dissolution By Proclamation / Annulment Of Authority,1985-05-28,1991-09-25,,,,,NEW YORK,,\"NORMAN'S TEEN TOURS, INC.|REIN TEEN TOURS, INC.\",2016-07-05 07:01:08 UTC,https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?p_nameid=0&p_corpid=1000003&p_entity_name=%25&p_name_type=%25&p_search_type=BEGINS&p_srch_results_page=0,,true,,,,,,,,,,\"REIN TEEN TOURS, INC., 206-11 LORI DRIVE, BAYSIDE, NEW YORK, 11361\",,,,United States,\"REIN TEEN TOURS, INC., 206-11 LORI DRIVE, BAYSIDE, NEW YORK, 11361, United States\",,,,,,,\n", + "\n" + ] + } + ], + "source": [ + "print(ny_companies.stdout)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture ny_officers\n", + "!gzcat ~/data/opencorporates/us_ny/officers.csv.gz | head -n 5" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"'id,company_number,jurisdiction_code,name,title,first_name,last_name,position,start_date,person_number,person_uid,end_date,current_status,occupation,nationality,country_of_residence,partial_date_of_birth,type,address.in_full,address.street_address,address.locality,address.region,address.postal_code,address.country,retrieved_at,source_url\\r\\n72342016,3332175,us_ny,WAJAD AZIMI,,,,agent,,,,,,,,,,,\"363 S BROADWAY, YONKERS, NEW YORK, 10705\",,,,,,2016-04-22 16:58:01 UTC,\\r\\n224809571,3785280,us_ny,BENEDETTA AMADI,,,,chief executive officer,,,,,,,,,,,\"BENEDETTA AMADI, BROOKLYN, NY, 11211\",BENEDETTA AMADI,BROOKLYN,NY,11211,,2024-05-13 11:37:21 UTC,\\r\\n224940888,2067108,us_ny,RICHARD P. SZMYR,,,,chief executive officer,,,,,,,,,,,\"RICHARD P. SZMYR, SCOTIA, NY, 12302\",RICHARD P. SZMYR,SCOTIA,NY,12302,,2024-05-06 11:39:24 UTC,\\r\\n225116861,2116899,us_ny,RON BUKSHPAN,,,,chief executive officer,,,,,,,,,,,\"RON BUKSHPAN, CUMMING, GA, 30041\",RON BUKSHPAN,CUMMING,GA,30041,,2024-05-06 11:39:24 UTC,\\r\\n'" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ny_officers.stdout" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture ny_additional_identifiers\n", + "!gzcat ~/data/opencorporates/us_ny/additional_identifiers.csv.gz | head -n 5" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture ny_non_reg_addresses\n", + "!gzcat ~/data/opencorporates/us_ny/non_reg_addresses.csv.gz | head -n 5" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture ny_alternative_names\n", + "!gzcat ~/data/opencorporates/us_ny/alternative_names.csv.gz | head -n 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Design the prompt that can be re-used for every file we want to load and visualize" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "# Shared prompt context: the directory tree plus the first lines (head -n 5) of each us_ny file captured above\n", + "parent_prompt = f\"\"\"\n", + "Please take the following context for the directory structure: \n", + "\n", + "{path_structure.stdout}\n", + "\n", + "Then consider only the `us_ny` folder, with the following files and headers:\n", + "\n", + "`companies.csv.gz`: \n", + "\n", + "{ny_companies.stdout}\n", + "\n", + "`officers.csv.gz`:\n", + "\n", + "{ny_officers.stdout}\n", + "\n", + "`additional_identifiers.csv.gz`:\n", + "\n", + "{ny_additional_identifiers.stdout}\n", + "\n", + "`non_reg_addresses.csv.gz`:\n", + "\n", + "{ny_non_reg_addresses.stdout}\n", + "\n", + "`alternative_names.csv.gz`:\n", + "\n", + "{ny_alternative_names.stdout}\n", + "\n", + "Proceed step-by-step to copy the resulting database into a parquet file compressed with ZSTD compression, using the duckdb dialect of SQL, for future use in a dbt model, in the `~/data/opencorporates` directory, sharded in the same way (by state), only for the ny state.\n", + "\n", + "Remember to also proceed step-by-step as an elite site reliability/devops/L20 principal warez engineer at google, returning as few tokens as possible, to debug this SQL code.
Give the complete corrected code!\n", + "\n", + "Please always remember to prefix the output with `%%sql` for the JupySQL cell magic :)\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 7582728\n", + "-rwxr--r-- 1 me staff 159K May 22 11:37 \u001b[31madditional_identifiers.csv.gz\u001b[m\u001b[m*\n", + "-rwxr--r-- 1 me staff 482K May 22 11:37 \u001b[31malternative_names.csv.gz\u001b[m\u001b[m*\n", + "-rwxr--r--@ 1 me staff 2.9G May 22 12:55 \u001b[31mcompanies.csv\u001b[m\u001b[m*\n", + "-rwxr--r--@ 1 me staff 438M May 22 11:38 \u001b[31mcompanies.csv.gz\u001b[m\u001b[m*\n", + "-rwxr--r-- 1 me staff 18M May 22 11:38 \u001b[31mnon_reg_addresses.csv.gz\u001b[m\u001b[m*\n", + "-rwxr--r-- 1 me staff 235M May 22 11:38 \u001b[31mofficers.csv.gz\u001b[m\u001b[m*\n" + ] + } + ], + "source": [ + "!ls -lh ~/data/opencorporates/us_ny" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Execute the prompt once for every file" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "AI generated code inserted below ⬇️" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 98, + "metadata": { + "text/html": { + "jupyter_ai": { + "model_id": "claude-3-opus-20240229", + "provider_id": "anthropic-chat" + } + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{parent_prompt}\n", + "\n", + "Only do this for this file in `~/data/opencorporates/us_ny`:\n", + "\n", + "```\n", + "-rwxr--r-- 1 me staff 159K May 22 11:37 additional_identifiers.csv.gz*\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Success
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Success]\n", + "Index: []" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "COPY\n", + "(\n", + " SELECT\n", + " company_number,\n", + " jurisdiction_code,\n", + " uid,\n", + " identifier_system_code\n", + " FROM read_csv_auto('~/data/opencorporates/us_ny/additional_identifiers.csv.gz', header=True, sep=',')\n", + ")\n", + "TO '~/data/opencorporates/us_ny/additional_identifiers.parquet'\n", + "(FORMAT 'PARQUET', CODEC 'ZSTD');" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "AI generated code inserted below ⬇️" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 100, + "metadata": { + "text/html": { + "jupyter_ai": { + "model_id": "claude-3-opus-20240229", + "provider_id": "anthropic-chat" + } + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{parent_prompt}\n", + "\n", + "Only do this for this file in `~/data/opencorporates/us_ny`:\n", + "\n", + "```\n", + "-rwxr--r-- 1 me staff 482K May 22 11:37 alternative_names.csv.gz*\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Success
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Success]\n", + "Index: []" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "CREATE OR REPLACE TABLE ny_alternative_names AS \n", + "SELECT * \n", + "FROM read_csv_auto('~/data/opencorporates/us_ny/alternative_names.csv.gz', header=True, sep=',');\n", + "\n", + "COPY (\n", + " SELECT *\n", + " FROM ny_alternative_names\n", + ") TO '~/data/opencorporates/us_ny/alternative_names.parquet' (FORMAT 'parquet', CODEC 'ZSTD');" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "AI generated code inserted below ⬇️" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 115, + "metadata": { + "text/html": { + "jupyter_ai": { + "model_id": "claude-3-opus-20240229", + "provider_id": "anthropic-chat" + } + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{parent_prompt}\n", + "\n", + "Only do this for this file in `~/data/opencorporates/us_ny`:\n", + "\n", + "```\n", + "-rwxr--r--@ 1 me staff 438M May 22 11:38 companies.csv.gz*\n", + "```\n", + "\n", + "Remember that if a column name contains a period, then you need to use double quotes around the entire column name. Do NOT use the `read_csv_auto` function and instead specify the type of every column in the `read_csv` options and parameters.\n", + "\n", + "Do NOT use the read_csv_auto function, and specify all the columns in the read_csv parameter.
Only give the final code and do not load any duckdb extensions nor create tables; one copy command suffices :)\n", + "\n", + "Instead of the `read_csv_auto` function, remember to use the `read_csv` function and specify all types correctly with the `columns` parameter :) \n", + "Instead of the `read_csv_auto` function, remember to use the `read_csv` function and specify all types correctly with the `columns` parameter :) \n", + "Instead of the `read_csv_auto` function, remember to use the `read_csv` function and specify all types correctly with the `columns` parameter :) \n", + "Instead of the `read_csv_auto` function, remember to use the `read_csv` function and specify all types correctly with the `columns` parameter :) " + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8824e2f59d6847258404942a657e4720", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Success
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Success]\n", + "Index: []" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "COPY (\n", + " SELECT\n", + " company_number,\n", + " jurisdiction_code,\n", + " name,\n", + " normalised_name,\n", + " company_type,\n", + " nonprofit,\n", + " current_status,\n", + " incorporation_date,\n", + " dissolution_date,\n", + " branch,\n", + " business_number,\n", + " current_alternative_legal_name,\n", + " current_alternative_legal_name_language,\n", + " home_jurisdiction_text,\n", + " native_company_number,\n", + " previous_names,\n", + " retrieved_at,\n", + " registry_url,\n", + " restricted_for_marketing,\n", + " inactive,\n", + " accounts_next_due,\n", + " accounts_reference_date,\n", + " accounts_last_made_up_date,\n", + " annual_return_next_due,\n", + " annual_return_last_made_up_date,\n", + " has_been_liquidated,\n", + " has_insolvency_history,\n", + " has_charges,\n", + " number_of_employees,\n", + " \"registered_address.street_address\",\n", + " \"registered_address.locality\",\n", + " \"registered_address.region\",\n", + " \"registered_address.postal_code\",\n", + " \"registered_address.country\",\n", + " \"registered_address.in_full\",\n", + " home_jurisdiction_code,\n", + " home_jurisdiction_company_number,\n", + " industry_code_uids,\n", + " latest_accounts_date,\n", + " latest_accounts_cash,\n", + " latest_accounts_assets,\n", + " latest_accounts_liabilities\n", + " FROM read_csv(\n", + " '~/data/opencorporates/us_ny/companies.csv.gz',\n", + " columns={\n", + " 'company_number': 'VARCHAR',\n", + " 'jurisdiction_code': 'VARCHAR',\n", + " 'name': 'VARCHAR',\n", + " 'normalised_name': 'VARCHAR',\n", + " 'company_type': 'VARCHAR',\n", + " 'nonprofit': 'BOOLEAN',\n", + " 'current_status': 'VARCHAR',\n", + " 'incorporation_date': 'DATE',\n", + " 'dissolution_date': 'DATE',\n", + " 'branch': 'VARCHAR',\n", + " 'business_number': 
'VARCHAR',\n", + " 'current_alternative_legal_name': 'VARCHAR',\n", + " 'current_alternative_legal_name_language': 'VARCHAR',\n", + " 'home_jurisdiction_text': 'VARCHAR',\n", + " 'native_company_number': 'VARCHAR',\n", + " 'previous_names': 'VARCHAR',\n", + " 'retrieved_at': 'TIMESTAMP',\n", + " 'registry_url': 'VARCHAR',\n", + " 'restricted_for_marketing': 'BOOLEAN',\n", + " 'inactive': 'BOOLEAN',\n", + " 'accounts_next_due': 'DATE',\n", + " 'accounts_reference_date': 'DATE',\n", + " 'accounts_last_made_up_date': 'DATE',\n", + " 'annual_return_next_due': 'DATE', \n", + " 'annual_return_last_made_up_date': 'DATE',\n", + " 'has_been_liquidated': 'BOOLEAN',\n", + " 'has_insolvency_history': 'BOOLEAN',\n", + " 'has_charges': 'BOOLEAN',\n", + " 'number_of_employees': 'INTEGER',\n", + " 'registered_address.street_address': 'VARCHAR',\n", + " 'registered_address.locality': 'VARCHAR',\n", + " 'registered_address.region': 'VARCHAR',\n", + " 'registered_address.postal_code': 'VARCHAR',\n", + " 'registered_address.country': 'VARCHAR',\n", + " 'registered_address.in_full': 'VARCHAR',\n", + " 'home_jurisdiction_code': 'VARCHAR',\n", + " 'home_jurisdiction_company_number': 'VARCHAR',\n", + " 'industry_code_uids': 'VARCHAR',\n", + " 'latest_accounts_date': 'DATE',\n", + " 'latest_accounts_cash': 'DOUBLE',\n", + " 'latest_accounts_assets': 'DOUBLE',\n", + " 'latest_accounts_liabilities': 'DOUBLE'\n", + " }\n", + " )\n", + ")\n", + "TO '~/data/opencorporates/us_ny/companies.parquet'\n", + "(FORMAT PARQUET, CODEC ZSTD, ROW_GROUP_SIZE 100000);" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "AI generated code inserted below ⬇️" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 120, + "metadata": { + "text/html": { + "jupyter_ai": { + "model_id": "claude-3-opus-20240229", + "provider_id": "anthropic-chat" + } + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%ai 
anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{parent_prompt}\n", + "\n", + "Only do this for this file in `~/data/opencorporates/us_ny`:\n", + "\n", + "```\n", + "-rwxr--r-- 1 me staff 18M May 22 11:38 non_reg_addresses.csv.gz*\n", + "```\n", + "\n", + "Make sure to only use the correct columns that actually appear in the header you have been provided. And make sure to specify the read_csv options manually rather than use the auto one. (do NOT use the `read_csv_auto` function!)." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Success
\n", + "
"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [Success]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "COPY (\n",
+    "    -- Sniffing is switched off, so the dialect and the schema are exactly what is written here\n",
+    "    SELECT *\n",
+    "    FROM read_csv(\n",
+    "        '~/data/opencorporates/us_ny/non_reg_addresses.csv.gz',\n",
+    "        auto_detect = false,\n",
+    "        delim = ',',\n",
+    "        header = true,\n",
+    "        null_padding = true,\n",
+    "        columns = {\n",
+    "            'company_number': 'VARCHAR',\n",
+    "            'jurisdiction_code': 'VARCHAR',\n",
+    "            'value': 'VARCHAR',\n",
+    "            'service_address': 'VARCHAR',\n",
+    "            'address.in_full': 'VARCHAR',\n",
+    "            'address.street_address': 'VARCHAR',\n",
+    "            'address.locality': 'VARCHAR',\n",
+    "            'address.region': 'VARCHAR',\n",
+    "            'address.postal_code': 'VARCHAR',\n",
+    "            'address.country': 'VARCHAR',\n",
+    "            'type': 'VARCHAR',\n",
+    "            'start_date': 'DATE',\n",
+    "            'end_date': 'DATE'\n",
+    "        }\n",
+    "    )\n",
+    ")\n",
+    "TO '~/data/opencorporates/us_ny/non_reg_addresses.parquet'\n",
+    "(FORMAT 'parquet', CODEC 'ZSTD');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "AI generated code inserted below ⬇️"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {
+      "text/html": {
+       "jupyter_ai": {
+        "model_id": "claude-3-opus-20240229",
+        "provider_id": "anthropic-chat"
+       }
+      }
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n",
+    "\n",
+    "{parent_prompt}\n",
+    "\n",
+    "Only do this for this file in `~/data/opencorporates/us_ny`:\n",
+    "\n",
+    "```\n",
+    "-rwxr--r-- 1 me staff 235M May 22 11:38 officers.csv.gz*\n",
+    "```\n",
+    "\n",
+    "Remember that if a column name contains a period, then you need to use double quotes around the entire column name. Do NOT use the `read_csv_auto` function and instead specify the type of every column in the `read_csv` options and parameters."
+ ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5f51f9f0480144d88039c5245927dea6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a099a147c4964d089ca803e354037739", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Success
\n", + "
"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [Success]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%sql\n",
+    "CREATE OR REPLACE TABLE officers AS\n",
+    "-- OR REPLACE keeps this cell re-runnable in a kernel where the table already exists.\n",
+    "-- Column types come from the columns struct passed to read_csv, so the projection needs no casts\n",
+    "-- and every output column keeps its plain source name in the parquet file.\n",
+    "SELECT\n",
+    "    id,\n",
+    "    company_number,\n",
+    "    jurisdiction_code,\n",
+    "    name,\n",
+    "    title,\n",
+    "    first_name,\n",
+    "    last_name,\n",
+    "    position,\n",
+    "    start_date,\n",
+    "    person_number,\n",
+    "    person_uid,\n",
+    "    end_date,\n",
+    "    current_status,\n",
+    "    occupation,\n",
+    "    nationality,\n",
+    "    country_of_residence,\n",
+    "    partial_date_of_birth,\n",
+    "    type,\n",
+    "    \"address.in_full\",\n",
+    "    \"address.street_address\",\n",
+    "    \"address.locality\",\n",
+    "    \"address.region\",\n",
+    "    \"address.postal_code\",\n",
+    "    \"address.country\",\n",
+    "    retrieved_at,\n",
+    "    source_url\n",
+    "FROM read_csv('~/data/opencorporates/us_ny/officers.csv.gz', delim=',', header=true, columns={\n",
+    "    'id': 'INT',\n",
+    "    'company_number': 'VARCHAR',\n",
+    "    'jurisdiction_code': 'VARCHAR',\n",
+    "    'name': 'VARCHAR',\n",
+    "    'title': 'VARCHAR',\n",
+    "    'first_name': 'VARCHAR',\n",
+    "    'last_name': 'VARCHAR',\n",
+    "    'position': 'VARCHAR',\n",
+    "    'start_date': 'DATE',\n",
+    "    'person_number': 'VARCHAR',\n",
+    "    'person_uid': 'VARCHAR',\n",
+    "    'end_date': 'DATE',\n",
+    "    'current_status': 'VARCHAR',\n",
+    "    'occupation': 'VARCHAR',\n",
+    "    'nationality': 'VARCHAR',\n",
+    "    'country_of_residence': 'VARCHAR',\n",
+    "    'partial_date_of_birth': 'DATE',\n",
+    "    'type': 'VARCHAR',\n",
+    "    'address.in_full': 'VARCHAR',\n",
+    "    'address.street_address': 'VARCHAR',\n",
+    "    'address.locality': 'VARCHAR',\n",
+    "    'address.region': 'VARCHAR',\n",
+    "    'address.postal_code': 'VARCHAR',\n",
+    "    'address.country': 'VARCHAR',\n",
+    "    'retrieved_at': 'TIMESTAMP',\n",
+    "    'source_url': 'VARCHAR'\n",
+    "});\n",
+    "\n",
+    "COPY officers TO '~/data/opencorporates/us_ny/officers.parquet' (FORMAT 'parquet', CODEC 'ZSTD');"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prototype visualizations and network analyses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture file_list\n",
+    "!ls ~/data/opencorporates/us_ny/*.parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture officers_parquet\n",
+    "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/officers.parquet' LIMIT 10;\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture non_reg_addresses_parquet\n",
+    "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/non_reg_addresses.parquet' LIMIT 10;\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture companies_parquet\n",
+    "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/companies.parquet' LIMIT 10;\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture alternative_names_parquet\n",
+    "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/alternative_names.parquet' LIMIT 10;\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture docs\n",
+    "\n",
+    "!curl {https://uwdata.github.io/mosaic/jupyter/}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 145,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture example\n",
+    "\n",
+    "!curl {https://pastebin.com/raw/XijHp75S}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = f\"\"\"\n",
+    "Take the following as context: \n",
+    "\n",
+    "```\n",
+    "!duckdb -markdown -c \"SELECT * FROM
'~/data/opencorporates/us_ny/officers.parquet' LIMIT 10;\"\n", + "\n", + "{officers_parquet.stdout}\n", + "```\n", + "\n", + "```\n", + "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/non_reg_addresses.parquet' LIMIT 10;\"\n", + "\n", + "{non_reg_addresses_parquet.stdout}\n", + "```\n", + "\n", + "```\n", + "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/companies.parquet' LIMIT 10;\"\n", + "\n", + "{companies_parquet.stdout}\n", + "```\n", + "\n", + "```\n", + "!duckdb -markdown -c \"SELECT * FROM '~/data/opencorporates/us_ny/alternative_names.parquet' LIMIT 10;\"\n", + "\n", + "{alternative_names_parquet.stdout}\n", + "```\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "AI generated code inserted below ⬇️" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 160, + "metadata": { + "text/html": { + "jupyter_ai": { + "model_id": "claude-3-opus-20240229", + "provider_id": "anthropic-chat" + } + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{prompt}\n", + "\n", + "Use the Jupyter widget and the above example to visualize what is reasonable in the companies.parquet file and header. debug it. proceed step-by-step as an elite site reliability/devops/L20 principal warez engineer at google, returning as few tokens as possible, to debug this visualization code using the reference. give the complete corrected code!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7f3ee488ff3c4122afa1cf67b9f09e61", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "21c72dc46a364d189ce1e71821cb4e55", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Jurisdiction:', options=('us_ny',), value='us_ny')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3f2a9d4ec4d44e70af0cc7f33ffab838", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "IntText(value=10, description='Limit:')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b84c2b407f53414bb8070bfc12c2a86f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Button(description='Query', style=ButtonStyle())" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3654da5fd1b44ca59c2d23b301e1e0a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "import duckdb\n", + "\n", + "parquet_file = \"~/data/opencorporates/us_ny/companies.parquet\"\n", + "conn = duckdb.connect()\n", + "\n", + "# Read the parquet file into a DuckDB table\n", + "conn.execute(f\"CREATE TABLE companies AS SELECT * FROM read_parquet('{parquet_file}')\")\n", + "\n", + "# Get the table header\n", + "header = 
conn.execute(\"SELECT * FROM companies LIMIT 0\").description\n", + "header = [col[0] for col in header]\n", + "\n", + "# Get the list of company jurisdictions\n", + "jurisdictions = conn.execute(\"SELECT DISTINCT jurisdiction_code FROM companies ORDER BY jurisdiction_code\").fetchall()\n", + "jurisdictions = [j[0] for j in jurisdictions]\n", + "\n", + "# Create widgets\n", + "jurisdiction_dropdown = widgets.Dropdown(options=jurisdictions, description='Jurisdiction:', value=jurisdictions[0])\n", + "limit_text = widgets.IntText(value=10, description='Limit:', min=1, max=1000)\n", + "\n", + "# Define the query function\n", + "def query_companies(jurisdiction, limit):\n", + " query = f\"SELECT * FROM companies WHERE jurisdiction_code = '{jurisdiction}' LIMIT {limit}\"\n", + " return conn.execute(query).fetchdf()\n", + "\n", + "# Create the output widget\n", + "output = widgets.Output()\n", + "\n", + "# Define the on_click function\n", + "def on_click(b):\n", + " with output:\n", + " output.clear_output()\n", + " df = query_companies(jurisdiction_dropdown.value, limit_text.value)\n", + " print(df.to_markdown(index=False))\n", + "\n", + "# Create the button widget\n", + "button = widgets.Button(description=\"Query\")\n", + "button.on_click(on_click)\n", + "\n", + "# Display the widgets\n", + "display(jurisdiction_dropdown, limit_text, button, output)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Invalid format specifier", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[163], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mai\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43manthropic-chat:claude-3-opus-20240229 --format code\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;132;43;01m{prompt}\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mUse the Jupyter widget and the above example to visualize what is reasonable in the companies.parquet file and header. debug it. proceed step-by-step as an elite site reliability/devops/L20 principal warez engineer at google, returning as few tokens as possible, to debug this visualization code using the reference. 
give the complete corrected code!\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mHere is the documentation for the Jupyter widget we will be using for visualization: from https://uwdata.github.io/mosaic/jupyter/\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;132;43;01m{example_raw}\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/projects/new-york-real-estate/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2517\u001b[0m, in \u001b[0;36mInteractiveShell.run_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2515\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 2516\u001b[0m args \u001b[38;5;241m=\u001b[39m (magic_arg_s, cell)\n\u001b[0;32m-> 2517\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2519\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2520\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2521\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2522\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/projects/new-york-real-estate/.venv/lib/python3.10/site-packages/jupyter_ai_magics/magics.py:618\u001b[0m, in 
\u001b[0;36mAiMagics.ai\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 615\u001b[0m ip \u001b[38;5;241m=\u001b[39m get_ipython()\n\u001b[1;32m 616\u001b[0m prompt \u001b[38;5;241m=\u001b[39m prompt\u001b[38;5;241m.\u001b[39mformat_map(FormatDict(ip\u001b[38;5;241m.\u001b[39muser_ns))\n\u001b[0;32m--> 618\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_ai_cell\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/projects/new-york-real-estate/.venv/lib/python3.10/site-packages/jupyter_ai_magics/magics.py:556\u001b[0m, in \u001b[0;36mAiMagics.run_ai_cell\u001b[0;34m(self, args, prompt)\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;66;03m# interpolate user namespace into prompt\u001b[39;00m\n\u001b[1;32m 555\u001b[0m ip \u001b[38;5;241m=\u001b[39m get_ipython()\n\u001b[0;32m--> 556\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[43mprompt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_map\u001b[49m\u001b[43m(\u001b[49m\u001b[43mFormatDict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mip\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muser_ns\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m provider\u001b[38;5;241m.\u001b[39mis_chat_provider:\n\u001b[1;32m 559\u001b[0m result \u001b[38;5;241m=\u001b[39m provider\u001b[38;5;241m.\u001b[39mgenerate([[HumanMessage(content\u001b[38;5;241m=\u001b[39mprompt)]])\n", + "\u001b[0;31mValueError\u001b[0m: Invalid format specifier" + ] + } + ], + "source": [ + "%%ai anthropic-chat:claude-3-opus-20240229 --format code\n", + "\n", + "{prompt}\n", + "\n", + "Use the Jupyter widget and the above example to visualize what is reasonable in the companies.parquet file and header. debug it. 
proceed step-by-step as an elite site reliability/devops/L20 principal warez engineer at google, returning as few tokens as possible, to debug this visualization code using the reference. give the complete corrected code!\n", + "\n", + "Here is the documentation for the Jupyter widget we will be using for visualization: from https://uwdata.github.io/mosaic/jupyter/\n", + "\n", + "{example_raw}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.in b/requirements.in index 6f6f173..b3266d1 100644 --- a/requirements.in +++ b/requirements.in @@ -28,4 +28,8 @@ matplotlib altair vega_datasets pip-tools -psutil \ No newline at end of file +psutil +seedir +mosaic-widget +tabulate +great_tables \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3a90648..8d5f032 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile @@ -25,6 +25,8 @@ anyio==4.3.0 # anthropic # httpx # jupyter-server +anywidget==0.9.11 + # via mosaic-widget appdirs==1.4.4 # via sqlfluff appnope==0.1.4 @@ -52,6 +54,7 @@ attrs==23.2.0 babel==2.15.0 # via # agate + # great-tables # jupyterlab-server backoff==2.2.1 # via posthog @@ -112,6 +115,8 @@ comm==0.2.2 # via # ipykernel # ipywidgets +commonmark==0.9.1 + # via great-tables contourpy==1.2.1 # via matplotlib cycler==0.12.1 @@ -152,6 
+157,7 @@ duckdb==0.10.2 # via # dbt-duckdb # duckdb-engine + # mosaic-widget duckdb-engine==0.12.0 # via -r requirements.in exceptiongroup==1.2.1 @@ -183,12 +189,16 @@ fsspec==2024.3.1 # huggingface-hub geopandas==0.14.4 # via -r requirements.in +great-tables==0.6.1 + # via -r requirements.in greenlet==3.0.3 # via # playwright # sqlalchemy h11==0.14.0 # via httpcore +htmltools==0.5.2 + # via great-tables httpcore==1.0.5 # via httpx httpx==0.27.0 @@ -207,19 +217,13 @@ idna==3.7 # yarl importlib-metadata==6.11.0 # via - # build # dask # dbt-semantic-interfaces - # fiona + # great-tables # jupyter-ai # jupyter-ai-magics - # jupyter-client - # jupyter-lsp - # jupyterlab - # jupyterlab-server - # nbconvert importlib-resources==6.4.0 - # via matplotlib + # via great-tables iniconfig==2.0.0 # via pytest ipykernel==6.29.4 @@ -240,6 +244,7 @@ ipython-genutils==0.2.0 ipywidgets==8.1.2 # via # -r requirements.in + # anywidget # jupyter isodate==0.6.1 # via @@ -394,6 +399,8 @@ monotonic==1.6 # via posthog more-itertools==10.2.0 # via dbt-semantic-interfaces +mosaic-widget==0.8.0 + # via -r requirements.in msgpack==1.0.8 # via # distributed @@ -404,6 +411,8 @@ multidict==6.0.5 # yarl mypy-extensions==1.0.0 # via typing-inspect +natsort==8.4.0 + # via seedir nbclient==0.10.0 # via nbconvert nbconvert==7.16.4 @@ -432,6 +441,7 @@ numpy==1.26.4 # contourpy # faiss-cpu # geopandas + # great-tables # langchain # langchain-community # matplotlib @@ -452,6 +462,7 @@ packaging==23.2 # distributed # duckdb-engine # geopandas + # htmltools # huggingface-hub # ipykernel # jupyter-server @@ -523,6 +534,8 @@ psutil==5.9.8 # -r requirements.in # distributed # ipykernel +psygnal==0.11.1 + # via anywidget ptyprocess==0.7.0 # via # pexpect @@ -530,7 +543,9 @@ ptyprocess==0.7.0 pure-eval==0.2.2 # via stack-data pyarrow==16.0.0 - # via -r requirements.in + # via + # -r requirements.in + # mosaic-widget pycparser==2.22 # via cffi pydantic==2.7.1 @@ -639,6 +654,8 @@ ruff==0.4.3 # via -r 
requirements.in seaborn==0.13.2 # via -r requirements.in +seedir==0.4.2 + # via -r requirements.in send2trash==1.8.3 # via jupyter-server shapely==2.0.4 @@ -678,6 +695,8 @@ sqlparse==0.5.0 # jupysql stack-data==0.6.3 # via ipython +tabulate==0.9.0 + # via -r requirements.in tblib==3.0.0 # via # distributed @@ -751,11 +770,13 @@ typing-extensions==4.11.0 # altair # anthropic # anyio + # anywidget # async-lru # dbt-core # dbt-semantic-interfaces + # great-tables + # htmltools # huggingface-hub - # ipython # jupyter-ai # jupyter-ai-magics # mashumaro @@ -799,9 +820,7 @@ yarl==1.9.4 zict==3.0.0 # via distributed zipp==3.18.1 - # via - # importlib-metadata - # importlib-resources + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip