diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 29bcc92..e9aa9ae 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -1,32 +1,141 @@ -name: Extraction Libraries -run-name: ${{ github.actor }} is testing out Extraction Libraries using GitHub Actions 🚀 -on: [push] +name: Build, Test, Lint & Upload to TestPypi and Pypi for Cellar_Extractor +on: + push: + branches: [ cellar ] + pull_request: + branches: [ cellar ] + jobs: - Explore-Extraction-Libraries: - runs-on: ubuntu-latest + test: + name: Test on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10', '3.11', '3.12'] + steps: - - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." - - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" - - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." - - name: Check out repository code - uses: actions/checkout@v3 - - name: Set up Python 3.9 - uses: actions/setup-python@v4 + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e cellar/ - # pip install echr-extractor - - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - - run: echo "🖥️ The workflow is now ready to test your code on the runner." - - name: List files in the repository + pip install setuptools wheel + pip install -r requirements.txt + + - name: Install package for testing run: | - ls ${{ github.workspace }} - - run: echo "🍏 This job's status is ${{ job.status }}." 
- - name: Test with pytest + pip install -e cellar/ + + - name: Run tests with pytest run: | - pip install pytest - pip install pytest-cov + pip install pytest pytest-cov pytest tests.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html + + build: + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + python-version: '3.9' + - run: | + python -m pip install --upgrade pip + pip install setuptools wheel + - run: python cellar/setup.py sdist bdist_wheel + - uses: actions/upload-artifact@v4 + with: + name: universal-wheels + path: | + dist/*.whl + dist/*.tar.gz + if-no-files-found: error + + testpypi-publish: + name: Publish to TestPyPI + needs: build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/project/cellar-extractor/ + permissions: + id-token: write + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: dist + + - name: Publish distribution to TestPyPi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + packages-dir: dist/* + + pypi-publish: + name: Publish to PyPI + needs: + - testpypi-publish + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/project/cellar-extractor/ + permissions: + id-token: write + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: dist + + - name: Publish distribution to PyPi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages_dir: dist/*/ + + github-release: + name: Sign the Python distribution with Sigstore and upload them to GitHub Releases + needs: + - pypi-publish + runs-on: ubuntu-latest + permissions: + id-token: write + contents: write + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: dist + + - name: Sign the Python distribution with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 
+ with: + inputs: >- + ./dist/**/*.whl + ./dist/**/*.tar.gz + + - name: Create Github release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + + - name: Upload artifact signatures to Github release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release upload + '${{ github.ref_name }}' ./dist/**/* + --repo '${{ github.repository }}' diff --git a/README.md b/README.md index 34ee8a2..1842f5f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,186 @@ -# extraction_libraries -Python library for extracting caselaw data from Cellar. -Full documentation available at [cellar-extractor](https://pypi.org/project/cellar-extractor/). +## Cellar extractor +This library contains two functions to get cellar case law data from eurlex. + +## Version +Python 3.9 onwards * + +## Tests +![Workflow Status](https://github.com/maastrichtlawtech/extraction_libraries/actions/workflows/github-actions.yml/badge.svg) + + +## Contributors + + +
+
+ + Pranav Bapat + + |
+
+
+ + Piotr Lewandowski + + |
+
+
+ + shashankmc + + |
+
+
+ + gijsvd + + |
+
+
+ + venvis + + |
+
pip install cellar-extractor
+
+## What are the functions?
+get_cellar
get_cellar_extra
get_nodes_and_edges_lists
filter_subject_matter
Analyzer
Writing
Writing
 class has three functions:
 to_csv() - Writes the operative part along with celex id into a csv file
 to_json() - Writes the operative part along with celex id into a json file
 to_txt() - Writes the operative part along with celex id into a txt file
 get_cellar
get_cellar_extra
get_nodes_and_edges_lists
filter_subject_matter
Analyzer
Writing
 Call the initialized instance of the class; the call returns the operative part as a list.
+ +```python +import cellar_extractor as cell +instance=cell.Analyzer(celex_id:str) +output_list=instance() +print(output_list) # prints operative part of the Case as a list +``` + + +The Writing class also takes a celex id through its constructor when it is initialized, and writes the content of its operative part into different files, depending on the function called.
+ +```python +import cellar_extractor as cell +instance=cell.Writing(celex_id:str) +output=instance.to_csv()#for csv +output=instance.to_txt()#for txt +output=instance.to_json()#for json + +``` diff --git a/cellar/README.md b/cellar/README.md index f5d9d64..271ea7e 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -2,7 +2,11 @@ This library contains two functions to get cellar case law data from eurlex. ## Version -Python 3.9 +Python 3.9 onwards * + +## Tests +![Workflow Status](https://github.com/maastrichtlawtech/extraction_libraries/actions/workflows/github-actions.yml/badge.svg) + ## Contributors diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py index 7d757ee..b658183 100644 --- a/cellar/cellar_extractor/json_to_csv.py +++ b/cellar/cellar_extractor/json_to_csv.py @@ -76,7 +76,7 @@ def json_to_csv(json_data): # Making commas as the only value separator in the dataset value = re.sub(r",", ";", str(value)) # Remove HTML tags - value = BeautifulSoup(value, "lxml").text + value = BeautifulSoup(value, "html.parser").text for j in [j for j, x in enumerate(COLS) if x == title]: data[j] = value diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index 2414a1a..2b873c6 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -24,7 +24,7 @@ def html_page_structure_one(self) -> list: table structure . The relevant text lies inside the coj-bold class of the span tag. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') div = parser.find_all('table') # Find all tables tag from the website one = [] for divs in div: @@ -50,7 +50,7 @@ def html_page_structure_two(self) -> list: comes after the keyword operative of the previous span tag. 
""" website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') p = parser.find_all('p') two = [] for para in p: @@ -69,7 +69,7 @@ def structure_three(self) -> list: table structure. The relevant text lies inside the coj-bold class of the span tag. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') table = parser.find_all('table') three = [] for tables in table: @@ -92,7 +92,7 @@ def structure_four(self) -> list: keyword operative of the previous span tag. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') p = parser.find_all('p') four = [] for para in p: @@ -116,7 +116,7 @@ def structure_five(self) -> list: comes after the keyword operative of the previous span tag. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') p = parser.find_all('p') five = [] for para in p: @@ -142,7 +142,7 @@ def structure_six(self) -> list: part of the respective h2 tag. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') div = parser.find_all('h2') six = [] for h2 in div: @@ -162,7 +162,7 @@ def structure_seven(self) -> list: the p tag , with the class name=normal. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') div = parser.find_all('table') seven = [] for divs in div: @@ -197,7 +197,7 @@ def structure_eight(self) -> list: the tbody tag.Returns a list as output. 
""" website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') tbody = parser.find_all('tbody') eight = [] @@ -224,7 +224,7 @@ def structure_nine(self) -> list: tag after the p tag where the keywords "on those grounds" exist. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') nine = [] div = parser.find_all('p') for divs in div: @@ -242,7 +242,7 @@ def structure_eleven(self) -> list: tag after the b tag where the keywords "operative part" exist. """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') bold = parser.find_all('b') eleven = [] @@ -265,7 +265,7 @@ def structure_ten(self): "On those grounds". """ website = requests.get(self.url, timeout=60).text - parser = BeautifulSoup(website, 'lxml') + parser = BeautifulSoup(website, 'html.parser') appender = [] for string in parser.stripped_strings: diff --git a/cellar/pyproject.toml b/cellar/pyproject.toml new file mode 100644 index 0000000..082ddd1 --- /dev/null +++ b/cellar/pyproject.toml @@ -0,0 +1,29 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cellar_extractor" +version = "1.1.3" +description = "Library for extracting cellar data" +authors = [ + { name = "LawTech Lab", email = "law-techlab@maastrichtuniversity.nl" } +] +license = { text = "MIT" } +readme = { file = "README.md", content-type = "text/markdown" } +keywords = ["cellar", "extractor"] +dependencies = [ + "bs4", + "SPARQLWrapper==2.0.0", + "requests==2.26.0", + "pandas", + "xmltodict==0.13.0", + "tqdm" +] + +[project.urls] +"Bug Tracker" = "https://github.com/maastrichtlawtech/extraction_libraries" +"Build Source" = "https://github.com/maastrichtlawtech/extraction_libraries" + +[tool.setuptools.packages.find] 
+include = ["cellar_extractor", "cellar_extractor.operative_extractions"] diff --git a/cellar/setup.py b/cellar/setup.py index 50bab1b..71735c0 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -10,11 +10,11 @@ setup( name='cellar_extractor', packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']), - version='1.0.61', + version='1.1.3', description='Library for extracting cellar data', author='LawTech Lab', license='MIT', - install_requires=['bs4','SPARQLWrapper==2.0.0', 'requests==2.26.0', 'pandas','lxml==4.6.3','xmltodict==0.13.0','tqdm'], + install_requires=['bs4','SPARQLWrapper', 'requests', 'pandas','xmltodict>=0.9.0','tqdm'], author_email='p.lewandowski@student.maastrichtuniversity.nl', keywords=['cellar', 'extractor'], long_description=long_descr, diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..87823cb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "cellar_extractor" +dynamic = ["readme"] +version = "1.1.3" +requires-python = ">= 3.9" +description = "Library for extracting cellar data" +authors = [ + { name = "LawTech Lab", email = "law-techlab@maastrichtuniversity.nl" } +] +license = { text = "MIT" } +keywords = ["cellar", "extractor"] +dependencies = [ + "bs4", + "SPARQLWrapper==2.0.0", + "requests==2.26.0", + "pandas", + "xmltodict==0.13.0", + "tqdm" +] + +[project.urls] +"Bug Tracker" = "https://github.com/maastrichtlawtech/extraction_libraries" +"Build Source" = "https://github.com/maastrichtlawtech/extraction_libraries" + +[tool.setuptools.packages.find] +where = ["cellar"] +include = ["cellar_extractor", "cellar_extractor.operative_extractions"] +namespaces = false + +[tool.setuptools.dynamic] +readme = {file = ["cellar/README.md"]} diff --git a/rechtspraak/rechtspraak_extractor/__pycache__/rechtspraak_functions.cpython-310.pyc 
b/rechtspraak/rechtspraak_extractor/__pycache__/rechtspraak_functions.cpython-310.pyc deleted file mode 100644 index cddcd27..0000000 Binary files a/rechtspraak/rechtspraak_extractor/__pycache__/rechtspraak_functions.cpython-310.pyc and /dev/null differ diff --git a/rechtspraak/rechtspraak_extractor/tests/__pycache__/rechtspraak_functions.cpython-310.pyc b/rechtspraak/rechtspraak_extractor/tests/__pycache__/rechtspraak_functions.cpython-310.pyc deleted file mode 100644 index c88f713..0000000 Binary files a/rechtspraak/rechtspraak_extractor/tests/__pycache__/rechtspraak_functions.cpython-310.pyc and /dev/null differ diff --git a/requirements.txt b/requirements.txt index f2c292d..824fa3c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,8 @@ -# For rechtspraak extractor +# For cellar extractor xmltodict requests bs4 -lxml +tqdm +SPARQLWrapper +pandas +