diff --git a/setup.py b/setup.py index a8f8e72..850d0df 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read(*relpath): setup( name="split_lang", - version="1.3.4", + version="1.3.5", description="A package for splitting text by languages through concatenating over split substrings based on their language", long_description=read("README.md"), long_description_content_type="text/markdown", diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb index f48e291..2b28a68 100644 --- a/split-lang-demo.ipynb +++ b/split-lang-demo.ipynb @@ -1,196 +1,196 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install split-lang==1.3.4" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from split_lang import LangSplitter\n", - "lang_splitter = LangSplitter()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install split-lang==1.3.5" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0|zh:你喜欢看\n", - "1|ja:アニメ\n", - "2|zh:吗\n" - ] - } - ], - "source": [ - "text = \"你喜欢看アニメ吗\"\n", - "\n", - "substr = lang_splitter.split_by_lang(\n", - " text=text,\n", - ")\n", - "for index, item in enumerate(substr):\n", - " print(f\"{index}|{item.lang}:{item.text}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from split_lang import LangSplitter\n", + "lang_splitter = LangSplitter()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0|zh:你喜欢看\n", - "1|ja:アニメ\n", - "2|zh:吗?我也喜欢看\n", - "----------------------\n", - "0|en:Please star this project on GitHub, Thanks you. I love you\n", - "1|zh:请加星这个项目,谢谢你。我爱你\n", - "2|ja:この項目をスターしてください、ありがとうございます!愛してる\n", - "----------------------\n", - "0.007998943328857422\n" - ] - } - ], - "source": [ - "lang_splitter.merge_across_punctuation=True\n", - "import time\n", - "texts = [\n", - " \"你喜欢看アニメ吗?我也喜欢看\",\n", - " \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", - "]\n", - "time1 = time.time()\n", - "for text in texts:\n", - " substr = lang_splitter.split_by_lang(\n", - " text=text,\n", - " )\n", - " for index, item in enumerate(substr):\n", - " print(f\"{index}|{item.lang}:{item.text}\")\n", - " print(\"----------------------\")\n", - "time2 = time.time()\n", - "print(time2 - time1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:你喜欢看\n", + "1|ja:アニメ\n", + "2|zh:吗\n" + ] + } + ], + "source": [ + "text = \"你喜欢看アニメ吗\"\n", + "\n", + "substr = lang_splitter.split_by_lang(\n", + " text=text,\n", + ")\n", + "for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0|zh:你喜欢看\n", - "1|ja:アニメ\n", - "2|zh:吗\n", - "3|punctuation:?\n", - "4|zh:我也喜欢看\n", - "----------------------\n", - "0|en:Please star this project on GitHub\n", - "1|punctuation:, \n", - "2|en:Thanks you\n", - "3|punctuation:. \n", - "4|en:I love you\n", - "5|zh:请加星这个项目\n", - "6|punctuation:,\n", - "7|zh:谢谢你\n", - "8|punctuation:。\n", - "9|zh:我爱你\n", - "10|ja:この項目をスターしてください\n", - "11|punctuation:、\n", - "12|ja:ありがとうございます\n", - "13|punctuation:!\n", - "14|ja:愛してる\n", - "----------------------\n", - "0.005997896194458008\n" - ] - } - ], - "source": [ - "lang_splitter.merge_across_punctuation = False\n", - "time1 = time.time()\n", - "for text in texts:\n", - " substr = lang_splitter.split_by_lang(\n", - " text=text,\n", - " )\n", - " for index, item in enumerate(substr):\n", - " print(f\"{index}|{item.lang}:{item.text}\")\n", - " print(\"----------------------\")\n", - "time2 = time.time()\n", - "print(time2 - time1)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:你喜欢看\n", + "1|ja:アニメ\n", + "2|zh:吗?我也喜欢看\n", + "----------------------\n", + "0|en:Please star this project on GitHub, Thanks you. I love you\n", + "1|zh:请加星这个项目,谢谢你。我爱你\n", + "2|ja:この項目をスターしてください、ありがとうございます!愛してる\n", + "----------------------\n", + "0.007998943328857422\n" + ] + } + ], + "source": [ + "lang_splitter.merge_across_punctuation=True\n", + "import time\n", + "texts = [\n", + " \"你喜欢看アニメ吗?我也喜欢看\",\n", + " \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", + "]\n", + "time1 = time.time()\n", + "for text in texts:\n", + " substr = lang_splitter.split_by_lang(\n", + " text=text,\n", + " )\n", + " for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")\n", + " print(\"----------------------\")\n", + "time2 = time.time()\n", + "print(time2 - time1)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0|zh:衬衫的价格是\n", - "1|digit:9.15\n", - "2|zh:便士\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:你喜欢看\n", + "1|ja:アニメ\n", + "2|zh:吗\n", + "3|punctuation:?\n", + "4|zh:我也喜欢看\n", + "----------------------\n", + "0|en:Please star this project on GitHub\n", + "1|punctuation:, \n", + "2|en:Thanks you\n", + "3|punctuation:. \n", + "4|en:I love you\n", + "5|zh:请加星这个项目\n", + "6|punctuation:,\n", + "7|zh:谢谢你\n", + "8|punctuation:。\n", + "9|zh:我爱你\n", + "10|ja:この項目をスターしてください\n", + "11|punctuation:、\n", + "12|ja:ありがとうございます\n", + "13|punctuation:!\n", + "14|ja:愛してる\n", + "----------------------\n", + "0.005997896194458008\n" + ] + } + ], + "source": [ + "lang_splitter.merge_across_punctuation = False\n", + "time1 = time.time()\n", + "for text in texts:\n", + " substr = lang_splitter.split_by_lang(\n", + " text=text,\n", + " )\n", + " for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")\n", + " print(\"----------------------\")\n", + "time2 = time.time()\n", + "print(time2 - time1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:衬衫的价格是\n", + "1|digit:9.15\n", + "2|zh:便士\n" + ] + } + ], + "source": [ + "lang_splitter.merge_across_digit = False\n", + "texts = [\n", + " \"衬衫的价格是9.15便士\",\n", + "]\n", + "for text in texts:\n", + " substr = lang_splitter.split_by_lang(\n", + " text=text,\n", + " )\n", + " for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "melotts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" } - ], - "source": [ - "lang_splitter.merge_across_digit = False\n", - "texts = [\n", - " \"衬衫的价格是9.15便士\",\n", - "]\n", - "for text in texts:\n", - " substr = lang_splitter.split_by_lang(\n", - " text=text,\n", - " )\n", - " for index, item in enumerate(substr):\n", - " print(f\"{index}|{item.lang}:{item.text}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "melotts", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 }