Skip to content

Commit

Permalink
Merge leetcode contest dataset with leet10k (#2559)
Browse files Browse the repository at this point in the history
  • Loading branch information
ehartford authored May 1, 2023
1 parent 2d2fd6b commit fefdcbf
Showing 1 changed file with 36 additions and 4 deletions.
40 changes: 36 additions & 4 deletions data/datasets/oa_leet10k/oa_leet10k.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@
"import random\n",
"from IPython.display import display\n",
"from datasets import Dataset\n",
"import requests\n",
"\n",
"data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n",
"lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n",
"\n",
"output_dir = \"data\"\n",
"os.makedirs(output_dir, exist_ok=True)"
]
Expand All @@ -54,7 +57,12 @@
"metadata": {},
"outputs": [],
"source": [
"kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)"
"kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)\n",
"r = requests.get(lc_contests_data_source, allow_redirects=True)\n",
"with open(\"data/lc_contests.json\", \"wb\") as f:\n",
" for chunk in r.iter_content(chunk_size=1024):\n",
" if chunk:\n",
" f.write(chunk)"
]
},
{
Expand All @@ -64,6 +72,7 @@
"outputs": [],
"source": [
"leetcode_solutions = pd.read_json(\"data/leetcode-solutions.jsonl\", lines=True)\n",
"leetcode_contests = pd.read_json(\"data/lc_contests.json\")\n",
"\n",
"# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n",
"# The INSTRUCTION a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n",
Expand All @@ -83,7 +92,21 @@
" \"SOURCE\": data_source,\n",
" }\n",
" )\n",
"\n",
"oa_leetcode_contests = []\n",
"for index, row in leetcode_contests.iterrows():\n",
" oa_leetcode_contests.append(\n",
" {\n",
" \"INSTRUCTION\": row[\"instruction\"] + \"\\n\" + row[\"input\"],\n",
" \"RESPONSE\": row[\"output\"],\n",
" \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n",
" }\n",
" )\n",
"\n",
"oa_leet10k = pd.DataFrame(oa_leet10k)\n",
"oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)\n",
"\n",
"print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n",
"\n",
"# Print the first 5 rows of the dataframe with full width and newline characters correctly displayed in the RESPONSE column\n",
"with pd.option_context(\"display.max_colwidth\", 80):\n",
Expand All @@ -94,7 +117,13 @@
" \"text-align\": \"left\",\n",
" \"white-space\": \"pre-wrap\",\n",
" }\n",
" )\n",
" ),\n",
" oa_leetcode_contests.head(5).style.set_properties(\n",
" **{\n",
" \"text-align\": \"left\",\n",
" \"white-space\": \"pre-wrap\",\n",
" }\n",
" ),\n",
" )"
]
},
Expand All @@ -106,9 +135,12 @@
"source": [
"# Upload dataset to HF\n",
"oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
"ds = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
"ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n",
"oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n",
"ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n",
"# Uncomment to push dataset to HF\n",
"# ds.push_to_hub(\"ehartford/oa_leet10k\")"
"# ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n",
"# ds_leetcode_contests.push_to_hub(\"ehartford/oa_leet10k\")"
]
}
],
Expand Down

0 comments on commit fefdcbf

Please sign in to comment.