From 65f5c2bd9241024b1a1525275c2c23ff2cbd5138 Mon Sep 17 00:00:00 2001 From: Amir Pourmand Date: Thu, 3 Aug 2023 17:35:43 +0330 Subject: [PATCH] Add Wikipedia Persian Dataset (#3629) Currently, the Open-Assistant model doesn't support Farsi. This is a text-only dataset for learning Farsi (Persian). One of my friends fine-tuned LLaMA on this dataset, and it could understand Farsi grammar and word usage very well. If the Open-Assistant team wants to add support for Farsi, this should be the first step. I have transformed the dataset into the standard format described [here](https://projects.laion.ai/Open-Assistant/docs/data/datasets) and uploaded it to [my Hugging Face account](https://huggingface.co/datasets/pourmand1376/fa-wikipedia). - #2974 --- data/datasets/__init__.py | 1 + data/datasets/fa-wikipedia/README.md | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 data/datasets/fa-wikipedia/README.md diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 6cf9e7f027..92a82179f0 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -4,6 +4,7 @@ "tv_dialogue": "sedthh/tv_dialogue", # TV and Movie dialogues and transcripts "fd_dialogue": "sedthh/fd_dialogue", # TV and Movie dialogues and transcripts from ForeverDreaming "tlcv2.0_oa": "pythainlp/tlcv2.0_oa", # Thai classical literature texts + "fa-wikipedia": "pourmand1376/fa-wikipedia", # Farsi Wikipedia texts } INSTRUCTION_DATASETS = { diff --git a/data/datasets/fa-wikipedia/README.md b/data/datasets/fa-wikipedia/README.md new file mode 100644 index 0000000000..728190a737 --- /dev/null +++ b/data/datasets/fa-wikipedia/README.md @@ -0,0 +1,6 @@ +This dataset is crawled from +[farsi wikipedia](https://fa.wikipedia.org/wiki/%D8%B5%D9%81%D8%AD%D9%87%D9%94_%D8%A7%D8%B5%D9%84%DB%8C). +This is valuable clean text data in persian (Farsi). It contains information +about all subjects. + +It has 2.53M Articles.