From 0d4adb5f1ad6c38a828370414a584dd485165dce Mon Sep 17 00:00:00 2001 From: Amir Pourmand Date: Thu, 3 Aug 2023 17:39:31 +0330 Subject: [PATCH] Add Isna Persian Dataset (#3631) This data is less important than the Wikipedia dataset, so I think [this pull request](https://github.com/LAION-AI/Open-Assistant/pull/3629) should be merged first. I have uploaded the data to [Hugging Face](https://huggingface.co/datasets/pourmand1376/isna-news) according to Open-Assistant's standard, so it shouldn't need any processing. --------- Co-authored-by: Oliver Stanley --- data/datasets/__init__.py | 1 + data/datasets/fa-isna-news/README.md | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 data/datasets/fa-isna-news/README.md diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 92a82179f0..7b2c077ea4 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -4,6 +4,7 @@ "tv_dialogue": "sedthh/tv_dialogue", # TV and Movie dialogues and transcripts "fd_dialogue": "sedthh/fd_dialogue", # TV and Movie dialogues and transcripts from ForeverDreaming "tlcv2.0_oa": "pythainlp/tlcv2.0_oa", # Thai classical literature texts + "fa-isna-news": "pourmand1376/isna-news", # Isna Persian News "fa-wikipedia": "pourmand1376/fa-wikipedia", # Farsi Wikipedia texts } diff --git a/data/datasets/fa-isna-news/README.md b/data/datasets/fa-isna-news/README.md new file mode 100644 index 0000000000..fa216bef44 --- /dev/null +++ b/data/datasets/fa-isna-news/README.md @@ -0,0 +1,2 @@ +This text-only dataset is crawled from [Isna news](https://isna.ir/). This is +biggest farsi news agency and thus the text is pretty clean.