From fefdcbf7596c36a99ea20097038c7f65cb6fb403 Mon Sep 17 00:00:00 2001 From: Eric Hartford Date: Mon, 1 May 2023 01:57:09 -0700 Subject: [PATCH] Merge leetcode contest dataset with leet10k (#2559) --- data/datasets/oa_leet10k/oa_leet10k.ipynb | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/data/datasets/oa_leet10k/oa_leet10k.ipynb b/data/datasets/oa_leet10k/oa_leet10k.ipynb index da5de42d82..796a9e343f 100644 --- a/data/datasets/oa_leet10k/oa_leet10k.ipynb +++ b/data/datasets/oa_leet10k/oa_leet10k.ipynb @@ -42,8 +42,11 @@ "import random\n", "from IPython.display import display\n", "from datasets import Dataset\n", + "import requests\n", "\n", "data_source = \"https://www.kaggle.com/datasets/erichartford/leetcode-solutions\"\n", + "lc_contests_data_source = \"https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json\"\n", + "\n", "output_dir = \"data\"\n", "os.makedirs(output_dir, exist_ok=True)" ] @@ -54,7 +57,12 @@ "metadata": {}, "outputs": [], "source": [ - "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)" + "kaggle.api.dataset_download_files(\"erichartford/leetcode-solutions\", \"data\", unzip=True)\n", + "r = requests.get(lc_contests_data_source, allow_redirects=True)\n", + "with open(\"data/lc_contests.json\", \"wb\") as f:\n", + " for chunk in r.iter_content(chunk_size=1024):\n", + " if chunk:\n", + " f.write(chunk)" ] }, { @@ -64,6 +72,7 @@ "outputs": [], "source": [ "leetcode_solutions = pd.read_json(\"data/leetcode-solutions.jsonl\", lines=True)\n", + "leetcode_contests = pd.read_json(\"data/lc_contests.json\")\n", "\n", "# Create dataframe with columns INSTRUCTION, RESPONSE, SOURCE\n", "# The INSTRUCTION a random choice from ONE_STEP_TEMPLATES with the language and content filled in\n", @@ -83,7 +92,21 @@ " \"SOURCE\": data_source,\n", " }\n", " )\n", + "\n", + "oa_leetcode_contests = []\n", + "for index, row in leetcode_contests.iterrows():\n", + " oa_leetcode_contests.append(\n", + " {\n", + " \"INSTRUCTION\": row[\"instruction\"] + \"\\n\" + row[\"input\"],\n", + " \"RESPONSE\": row[\"output\"],\n", + " \"SOURCE\": \"https://github.com/Nan-Do/LeetCodeContestsDataset\",\n", + " }\n", + " )\n", + "\n", "oa_leet10k = pd.DataFrame(oa_leet10k)\n", + "oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)\n", + "\n", + "print(f\"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}\")\n", "\n", "# Print the first 5 rows of the dataframe with full width and newline characters correctly displayed in the RESPONSE column\n", "with pd.option_context(\"display.max_colwidth\", 80):\n", @@ -94,7 +117,13 @@ " \"text-align\": \"left\",\n", " \"white-space\": \"pre-wrap\",\n", " }\n", - " )\n", + " ),\n", + " oa_leetcode_contests.head(5).style.set_properties(\n", + " **{\n", + " \"text-align\": \"left\",\n", + " \"white-space\": \"pre-wrap\",\n", + " }\n", + " ),\n", " )" ] }, @@ -106,9 +135,12 @@ "source": [ "# Upload dataset to HF\n", "oa_leet10k.to_parquet(\"oa_leet10k.parquet\", row_group_size=100, engine=\"pyarrow\")\n", - "ds = Dataset.from_parquet(\"oa_leet10k.parquet\")\n", + "ds_leet10k = Dataset.from_parquet(\"oa_leet10k.parquet\")\n", + "oa_leetcode_contests.to_parquet(\"oa_leetcode_contests.parquet\", row_group_size=100, engine=\"pyarrow\")\n", + "ds_leetcode_contests = Dataset.from_parquet(\"oa_leetcode_contests.parquet\")\n", "# Uncomment to push dataset to HF\n", - "# ds.push_to_hub(\"ehartford/oa_leet10k\")" + "# ds_leet10k.push_to_hub(\"ehartford/oa_leet10k\")\n", + "# ds_leetcode_contests.push_to_hub(\"ehartford/oa_leet10k\")" ] } ],