diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..7f578f1 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,24 @@ +# Check http://editorconfig.org for more information +# This is the main config file for this project: +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true + +[*.{py, pyi}] +indent_style = space +indent_size = 4 + +[Makefile] +indent_style = tab + +[*.md] +trim_trailing_whitespace = false + +[*.{diff,patch}] +trim_trailing_whitespace = false diff --git a/.github/.stale.yml b/.github/.stale.yml new file mode 100644 index 0000000..dc90e5a --- /dev/null +++ b/.github/.stale.yml @@ -0,0 +1,17 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. Set to `false` to disable +closeComment: false diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..036bffc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,30 @@ +--- +name: ๐ Bug report +about: If something isn't working ๐ง +title: "" +labels: bug +assignees: +--- + +## ๐ Bug Report + + + +## ๐ฌ How To Reproduce + +Steps to reproduce the behavior: + +1. ... + +### Environment + +- OS: [e.g. 
Linux / Windows / macOS] +- Python version, get it with: + +```bash +python --version +``` + +## ๐ Additional context + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..8f2da54 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,3 @@ +# Configuration: https://help.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository + +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..7ce8c12 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,15 @@ +--- +name: ๐ Feature request +about: Suggest an idea for this project ๐ +title: "" +labels: enhancement +assignees: +--- + +## ๐ Feature Request + + + +## ๐ Additional context + + diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..b2ca0d9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,25 @@ +--- +name: โ Question +about: Ask a question about this project ๐ +title: "" +labels: question +assignees: +--- + +## Checklist + + + +- [ ] I've searched the project's [`issues`](https://github.com/monologg/KoBigBird/issues?q=is%3Aissue) + +## โ Question + + + +How can I [...]? + +Is it possible to [...]? 
+ +## ๐ Additional context + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..4dab74c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,7 @@ +## Description + + + +## Related Issue + + diff --git a/.github/images/kobigbird-logo.png b/.github/images/kobigbird-logo.png new file mode 100644 index 0000000..6244643 Binary files /dev/null and b/.github/images/kobigbird-logo.png differ diff --git a/.github/images/memory-usage.png b/.github/images/memory-usage.png new file mode 100644 index 0000000..6dc5f3b Binary files /dev/null and b/.github/images/memory-usage.png differ diff --git a/.github/images/sparse-attention.png b/.github/images/sparse-attention.png new file mode 100644 index 0000000..a2ef111 Binary files /dev/null and b/.github/images/sparse-attention.png differ diff --git a/.github/images/time-usage.png b/.github/images/time-usage.png new file mode 100644 index 0000000..7f641c4 Binary files /dev/null and b/.github/images/time-usage.png differ diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 0000000..411b93f --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,21 @@ +name-template: "v$RESOLVED_VERSION" +tag-template: "v$RESOLVED_VERSION" + +categories: + - title: ":rocket: Features" + labels: [enhancement, feature] + - title: ":wrench: Fixes" + labels: [bug, bugfix, fix] + - title: ":toolbox: Maintenance & Refactor" + labels: [refactor, refactoring, chore] + - title: ":package: Build System & CI/CD & Test" + labels: [build, ci, testing, test] + - title: ":pencil: Documentation" + labels: [documentation] + - title: ":arrow_up: Dependencies updates" + labels: [dependencies] + +template: | + ## Whatโs Changed + + $CHANGES diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml new file mode 100644 index 0000000..0018f7f --- /dev/null +++ b/.github/workflows/linter.yml @@ -0,0 +1,32 @@ +name: linter + +on: [push] 
+ +jobs: + check-lint: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Cache pip + uses: actions/cache@v2 + with: + # This path is specific to Ubuntu + path: ~/.cache/pip + # Look to see if there is a cache hit for the corresponding requirements file + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + - name: Check Lint (black, flake8, isort) + run: | + make quality diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..a052646 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,20 @@ +name: Release Drafter + +on: + push: + # branches to consider in the event; optional, defaults to all + branches: + - master + # pull_request event is required only for autolabeler + pull_request: + # Only following types are handled by the action, but one can default to all as well + types: [opened, reopened, synchronize] + +jobs: + update_release_draft: + runs-on: ubuntu-latest + steps: + # Drafts your next Release notes as Pull Requests are merged into "master" + - uses: release-drafter/release-drafter@v5.15.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..de30ccd --- /dev/null +++ b/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller 
builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Custom Ignore +.vscode/ +.idea/ +.DS_Store +test.ipynb diff --git a/.gitmessage b/.gitmessage new file mode 100644 index 0000000..eafd7d4 --- /dev/null +++ b/.gitmessage @@ -0,0 +1,16 @@ +# Title: Summary, imperative, don't end with a period +# No more than 50 chars. #### 50 chars is here: # + +# Remember blank line between title and body. + +# Body: Explain *what* and *why* (not *how*). +# Wrap at 72 chars. 
################################## which is here: # + +# feat : ๊ธฐ๋ฅ (์๋ก์ด ๊ธฐ๋ฅ) +# fix : ๋ฒ๊ทธ (๋ฒ๊ทธ ์์ ) +# refactor: ๋ฆฌํฉํ ๋ง +# style : ์คํ์ผ (์ฝ๋ ํ์, ์ธ๋ฏธ์ฝ๋ก ์ถ๊ฐ: ๋น์ฆ๋์ค ๋ก์ง์ ๋ณ๊ฒฝ ์์) +# docs : ๋ฌธ์ (๋ฌธ์ ์ถ๊ฐ, ์์ , ์ญ์ ) +# dep : ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์์กด์ฑ (๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ถ๊ฐ, ๋ฒ์ ์์ ) +# test : ํ ์คํธ (ํ ์คํธ ์ฝ๋ ์ถ๊ฐ, ์์ , ์ญ์ : ๋น์ฆ๋์ค ๋ก์ง์ ๋ณ๊ฒฝ ์์) +# chore : ๊ธฐํ ๋ณ๊ฒฝ์ฌํญ (๋น๋ ์คํฌ๋ฆฝํธ ์์ ๋ฑ) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3e6c413 --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ +clean: clean-pyc clean-test +quality: set-style-dep check-quality +style: set-style-dep set-style + +##### basic ##### +set-git: + git config --local commit.template .gitmessage + +set-style-dep: + pip3 install isort==5.9.3 black==21.7b0 flake8==3.9.2 + +set-style: + black --config pyproject.toml . + isort --settings-path pyproject.toml . + flake8 . + +check-quality: + black --config pyproject.toml --check . + isort --settings-path pyproject.toml --check-only . + flake8 . + +##### clean ##### +clean-pyc: + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: + rm -f .coverage + rm -f .coverage.* + rm -rf .pytest_cache + rm -rf .mypy_cache diff --git a/README.md b/README.md new file mode 100644 index 0000000..e887d3e --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +
+ What is BigBird • + How to Use • + Pretraining • + Evaluation Result • + Docs • + Citation +
+ ++ 한국어 | + English +
+ + + ++ What is BigBird • + How to Use • + Pretraining • + Evaluation Result • + Docs • + Citation +
+ ++ 한국어 | + English +
+ + + ++ 한국어 | + English +
+ +## Details + +- KoBigBird performance evaluation in `max_seq_length<=512` setting + +- Evaluated with a total of **5 Datasets** + + - Single Sentence Classification: `NSMC` + - Sentence Pair Classification: `KLUE-NLI`, `KLUE-STS` + - Question Answering: `Korquad 1.0`, `KLUE-MRC` + +- **Based on the [KLUE-Baseline](https://github.com/KLUE-benchmark/KLUE-baseline) code with some modifications** + + - Add `nsmc` and `korquad 1.0` tasks + - Fix to be compatible with `transformers==4.11.3` + +- Sequence Classification is trained with a length of **128** and Question Answering with a length of **512** + + - **Full Attention** instead of Sparse Attention (Automatically changed to Full Attention with the following log) + + ```text + Attention type 'block_sparse' is not possible if sequence_length: 300 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. + Changing attention type to 'original_full'... + ``` + +## Result + +| | NSMC+ ํ๊ตญ์ด | + English +
+ +## Details + +- `max_seq_length<=512` ํ๊ฒฝ์์์ KoBigBird ์ฑ๋ฅ ํ๊ฐ + +- ์ด **5๊ฐ์ Dataset**์ผ๋ก ํ๊ฐ + + - Single Sentence Classification: `NSMC` + - Sentence Pair Classification: `KLUE-NLI`, `KLUE-STS` + - Question Answering: `Korquad 1.0`, `KLUE-MRC` + +- **[KLUE-Baseline](https://github.com/KLUE-benchmark/KLUE-baseline)์ ์ฝ๋๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ์ผ๋ถ ์์ ํ์ฌ ํ์ต** + + - `nsmc`์ `korquad 1.0` task ์ถ๊ฐ + - `transformers==4.11.3`์ ํธํ๋๋๋ก ์์ + +- Sequence Classification์ **128**, Question Answering์ **512**์ ๊ธธ์ด๋ก ํ์ต + + - Sparse Attention์ด ์๋ **Full Attention**์ผ๋ก ์ธํ (์๋์ ๋ก๊ทธ๊ฐ ๋์ค๋ฉด์ ์๋์ผ๋ก Full Attention์ผ๋ก ๋ณ๊ฒฝ) + + ```text + Attention type 'block_sparse' is not possible if sequence_length: 300 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. + Changing attention type to 'original_full'... + ``` + +## Result + +| | NSMC+ ํ๊ตญ์ด | + English +
+ +## About Dataset + +| Dataset | Task | Length (median) | Length (max) | +| ------------------ | ----------------------- | --------------: | -----------: | +| **TyDi QA** | Question Answering | 6,165 | 67,135 | +| **Korquad 2.1** | Question Answering | 5,777 | 486,730 | +| **Fake News** | Sequence Classification | 564 | 17,488 | +| **Modu Sentiment** | Sequence Classification | 185 | 5,245 | + +- `Length`๋ subword token์ ๊ธฐ์ค์ผ๋ก ๊ณ์ฐํ์ต๋๋ค. +- [TyDi QA](https://github.com/google-research-datasets/tydiqa)๋ ๋ณธ๋ `๋ค๊ตญ์ด(multilingual) ๋ฐ์ดํฐ์ `์ด๋ฉฐ `์-์๋์ค (BoolQA)` ๋ต๋ณ ๋ฐ์ดํฐ๋ฅผ ํฌํจํฉ๋๋ค. **๋ณธ ํ๋ก์ ํธ์์๋ ํ๊ตญ์ด ๋ฐ์ดํฐ์ ๋ง์ ์ฌ์ฉํ์ผ๋ฉฐ, ์-์๋์ค ๋ต๋ณ ๋ฐ์ดํฐ ๋ํ ์ ์ธํ์์ต๋๋ค.** + +## Setup + +### 1. Requirements + +```bash +pip3 install -r requirements.txt +``` + +### 2. Prepare Dataset + +#### 1) Question Answering + +```bash +bash download_qa_dataset.sh +``` + +#### 2) Sequence Classification + +- **์๋์ ๋งํฌ๋ฅผ ํตํด ๋ฐ์ดํฐ๋ฅผ ๋ค์ด๋ก๋ ํ, ๋ฐ์ดํฐ๋ฅผ `--data_dir` ๊ฒฝ๋ก์ ์์น์์ผ ์ฃผ์ธ์.** +- `Fake news`: [Korean Fake news](https://github.com/2alive3s/Fake_news/blob/b43638105f4802de5773c21afe539157ebed6cc5/data/mission2_train.zip) (`mission2_train.csv`) +- `Modu sentiment corpus`: [๊ฐ์ฑ ๋ถ์ ๋ง๋ญ์น 2020](https://corpus.korean.go.kr) (`EXSA2002108040.json`) + +## How to Run + +- ํฐ ๊ท๋ชจ์ Long sequence ๋ฐ์ดํฐ์ ์ ํ์ต์ํค๊ธฐ ์ํด์ **TPU ์ธ์คํด์ค**์์ ์คํํ๋ ๊ฒ์ ๊ถ์ฅํฉ๋๋ค. +- ํ๊ฐ ๊ฒฐ๊ณผ๋ ๋ชจ๋ [torch-xla-1.8.1](https://github.com/pytorch/xla#-consume-prebuilt-compute-vm-images) ํ๊ฒฝ์์ `TPU v3-8`์ ์ด์ฉํ์ฌ ํ์ต ๋ฐ ํ๊ฐํ์ต๋๋ค. +- TPU๊ฐ ์๋ GPU๋ก ํ์ตํ๊ณ ์ถ์ ์ ์คํฌ๋ฆฝํธ ์์ `--use_tpu` ์ธ์๋ฅผ ์ ์ธํ๋ฉด ๋ฉ๋๋ค. 
+ +```bash +bash scripts/run_{$TASK_NAME}.sh # kobigbird +bash scripts/run_{$TASK_NAME}_short.sh # klue roberta +``` + +```bash +bash scripts/run_tydiqa.sh # tydiqa +bash scripts/run_korquad_2.sh # korquad 2.1 +bash scripts/run_fake_news.sh # fake news +bash scripts/run_modu_sentiment.sh # modu sentiment +``` + +## Results + +- Sequence Classification์ ๊ฒฝ์ฐ `train:test=8:2` ๋ก splitํ์ฌ ํ๊ฐํ์ต๋๋ค. +- Korquad 2.1 ๋ฐ์ดํฐ์ ์ ๊ฒฝ์ฐ, ์ปดํจํ ์์์ ํ๊ณ๋ก **ํ์ต ๋ฐ์ดํฐ์ ์ ์ผ๋ถ๋ง์ผ๋ก ๋ชจ๋ธ์ ํ์ต**ํ์ต๋๋ค. + - `--all_korquad_2_sample` ์ธ์๋ฅผ ์คํฌ๋ฆฝํธ์ ์ถ๊ฐํ๋ฉด ์ ์ฒด ๋ฐ์ดํฐ๋ฅผ ์ด์ฉํ์ฌ ํ์ต ๊ฐ๋ฅ +- `KoBigBird`์ ๊ฒฝ์ฐ Question Answering์ **4096**, Sequence Classification์ **1024**์ ๊ธธ์ด๋ก ํ์ตํ์ต๋๋ค. +- `KLUE RoBERTa`๋ **512**์ ๊ธธ์ด๋ก ํ์ตํ์ต๋๋ค. + +| | TyDi QA+ ํ๊ตญ์ด | + English +
+ +## About Dataset + +| Dataset | Task | Length (median) | Length (max) | +| ------------------ | ----------------------- | --------------: | -----------: | +| **TyDi QA** | Question Answering | 6,165 | 67,135 | +| **Korquad 2.1** | Question Answering | 5,777 | 486,730 | +| **Fake News** | Sequence Classification | 564 | 17,488 | +| **Modu Sentiment** | Sequence Classification | 185 | 5,245 | + +- `Length` is calculated based on subword tokens. +- [TyDi QA](https://github.com/google-research-datasets/tydiqa) is originally `multilingual` and contains `BoolQA` cases. **We only use Korean samples and skip BoolQA samples.** + +## Setup + +### 1. Requirements + +```bash +pip3 install -r requirements.txt +``` + +### 2. Prepare Dataset + +#### 1) Question Answering + +```bash +bash download_qa_dataset.sh +``` + +#### 2) Sequence Classification + +- **After downloading the data through the links below, place the data in the `--data_dir` path.** +- `Fake news`: [Korean Fake news](https://github.com/2alive3s/Fake_news/blob/b43638105f4802de5773c21afe539157ebed6cc5/data/mission2_train.zip) (`mission2_train.csv`) +- `Modu sentiment corpus`: [감성 분석 말뭉치 2020](https://corpus.korean.go.kr) (`EXSA2002108040.json`) + +## How to Run + +- We highly recommend running the scripts on a **TPU instance** in order to train and evaluate on large, long-sequence datasets. +- We trained and evaluated the models on the [torch-xla-1.8.1](https://github.com/pytorch/xla#-consume-prebuilt-compute-vm-images) environment with `TPU v3-8`. +- Omit the `--use_tpu` argument for GPU training. + +```bash +bash scripts/run_{$TASK_NAME}.sh # kobigbird +bash scripts/run_{$TASK_NAME}_short.sh # klue roberta +``` + +```bash +bash scripts/run_tydiqa.sh # tydiqa +bash scripts/run_korquad_2.sh # korquad 2.1 +bash scripts/run_fake_news.sh # fake news +bash scripts/run_modu_sentiment.sh # modu sentiment +``` + +## Results + +- For sequence classification, the data was split `train:test=8:2` for evaluation. 
+- For `korquad 2.1`, we **only use a subset of the training dataset** because of limited computational resources. + - Enable the `--all_korquad_2_sample` argument in order to use the full training dataset. +- In the case of `KoBigBird`, question answering was trained with a length of **4096** and sequence classification was trained with a length of **1024**. +- `KLUE RoBERTa` was trained with a length of **512**. + +| | TyDi QA", + "
", + "", + "", + "", + "", + "", + "
|