Egork/flake8 #171

Open
egork520 wants to merge 34 commits into main from egork/flake8
Commits (34)
4f4c5c6
Moving vila test to a class so that pytest ./test works locally
egork520 Oct 19, 2022
ff18984
Added notes on how to run unit tests locally
egork520 Oct 19, 2022
7f7bd66
Moving change of directory to the setUp class. Locally tests fail in…
egork520 Oct 19, 2022
747ace4
Moving change of directory to the setUp class. Locally tests fail in…
egork520 Oct 20, 2022
a425583
Creating variable for the fixtures path
egork520 Oct 20, 2022
38b76b3
Adding pytest.ini, no need to type the folder for the tests
egork520 Oct 20, 2022
425b072
Adding exact command for selecting tests, removing directory name, we…
egork520 Oct 20, 2022
5c1ec12
Adding test install dependencies, for running tests on multiple cpus,…
egork520 Oct 20, 2022
062f953
Command for running tests on multiple cpus
egork520 Oct 20, 2022
55689ae
Adding test requirements installation to the mmda-ci.yml
egork520 Oct 20, 2022
9e1bf61
Adding plugin for running coverage more easily, removing coverage
egork520 Oct 21, 2022
6fbf179
Adding coverage config file,
egork520 Oct 21, 2022
4035404
Adding coverage lower bound 57%, specifying module for which to measu…
egork520 Oct 21, 2022
9d9d4d7
Removing individual config files in favor of setup.cfg file
egork520 Oct 21, 2022
7739963
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
d0f7117
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
faded77
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
ac54653
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
c1b97fd
Update tests/test_predictors/test_figure_table_predictors.py
egork520 Oct 21, 2022
cd06980
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
fb00985
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
69e1c7b
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
55cd37d
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
ffa7b56
Rolling in test dependencies into dev
egork520 Oct 21, 2022
c26c46b
Fixing typos
egork520 Oct 21, 2022
5e4b09f
Adding parameters for the coverage
egork520 Oct 21, 2022
45eff40
Specifying percentage of the coverage in the builds
egork520 Oct 21, 2022
641b542
Updating comments about pytest
egork520 Oct 21, 2022
cf0436b
Merge branch 'main' of github.com:allenai/mmda into egork/flake8
egork520 Oct 22, 2022
6ba65aa
First part of lint fixing
egork520 Oct 24, 2022
690c690
Second part of lint fixing
egork520 Oct 24, 2022
7ad0d57
Skip old code
egork520 Oct 24, 2022
0a9786a
Adding flake8 run to the compile step
egork520 Oct 24, 2022
46961d6
Adding a note on how to run the flake8 test
egork520 Oct 24, 2022
1 change: 1 addition & 0 deletions .github/workflows/mmda-ci.yml
@@ -24,6 +24,7 @@ jobs:
- name: Test with Python ${{ matrix.python-version }}
run: |
pip install -e .[dev,pysbd_predictors,hf_predictors]
flake8
pytest --cov-fail-under=42 --ignore=tests/test_predictors/test_vila_predictors.py --ignore=tests/test_predictors/test_figure_table_predictors.py

test_vila_predictors:
9 changes: 9 additions & 0 deletions README.md
@@ -9,6 +9,15 @@ conda create -n mmda python=3.8
pip install -e '.[dev,<extras_require section from setup.py>]'
```

## PEP 8 - Style guide for Python code
https://peps.python.org/pep-0008/

We propose to follow the PEP 8 style guide. To check the code, run:

```bash
flake8
```
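
For quicker iteration, flake8 can also be scoped to part of the tree (a sketch, assuming the flake8 settings live in setup.cfg as the commit history suggests; `mmda` and `tests` are the package and test directories of this repo):

```bash
# Check only the package and test directories, and print a per-code summary
# of how many violations were found.
flake8 mmda tests --count --statistics
```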

## Unit testing
Note that pytest runs coverage, which measures the unit-test coverage of the code.
The required coverage percentage is configured in the setup.cfg file.
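
To reproduce the CI check locally (a sketch; the coverage threshold comes from setup.cfg, and the parallel run assumes pytest-xdist is among the test dependencies mentioned in the commit history):

```bash
# Run the unit tests; coverage settings and pytest defaults are read from setup.cfg.
pytest

# Spread the tests across all available CPUs (requires pytest-xdist).
pytest -n auto
```
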
2 changes: 1 addition & 1 deletion mmda/eval/vlue.py
@@ -78,7 +78,7 @@ def read_labels(labels_json_path: str) -> list[LabeledDoc]:
list[LabeledDoc]: List of labeled documents
"""
with open(labels_json_path, encoding="utf-8") as f:
labels = [LabeledDoc(**l) for l in json.loads(f.read())]
labels = [LabeledDoc(**line) for line in json.loads(f.read())]

return labels

33 changes: 22 additions & 11 deletions mmda/featurizers/citation_link_featurizers.py
@@ -1,12 +1,11 @@
import pandas as pd
from pydantic import BaseModel
import re
from typing import List, Dict

import pandas as pd
from thefuzz import fuzz
from typing import List, Tuple, Dict

from mmda.types.annotation import SpanGroup


DIGITS = re.compile(r'[0-9]+')
ALPHA = re.compile(r'[A-Za-z]+')
RELEVANT_PUNCTUATION = re.compile(r"\(|\)|\[|,|\]|\.|&|\;")
@@ -23,6 +22,7 @@
JACCARD_ALPHA = "jaccard_alpha"
MATCH_FIRST_TOKEN = "match_first_token"


class CitationLink:
def __init__(self, mention: SpanGroup, bib: SpanGroup):
self.mention = mention
@@ -31,6 +31,7 @@ def __init__(self, mention: SpanGroup, bib: SpanGroup):
def to_text_dict(self) -> Dict[str, str]:
return {"source_text": "".join(self.mention.symbols), "target_text": "".join(self.bib.symbols)}


def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
# create dataframe
df = pd.DataFrame.from_records([link.to_text_dict() for link in possible_links])
@@ -46,15 +47,16 @@ def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
df[MATCH_NUMERIC] = df.apply(lambda row: match_numeric(row['source_text'], row['target_text']), axis=1)
df[JACCARD_ALPHA] = df.apply(lambda row: jaccard_alpha(row['source_text'], row['target_text']), axis=1)
df[MATCH_FIRST_TOKEN] = df.apply(lambda row: match_first_token(row['source_text'], row['target_text']), axis=1)

# drop text columns
X_features = df.drop(columns=['source_text', 'target_text'])
return X_features


def ngramify(s: str, n: int) -> List[str]:
s_len = len(s)
return [s[i:i+n] for i in range(s_len-n+1)]
return [s[i:i + n] for i in range(s_len - n + 1)]


def jaccard_ngram(ngrams1: List[str], ngrams2: List[str]) -> float:
if ngrams1 or ngrams2:
@@ -64,24 +66,28 @@ def jaccard_ngram(ngrams1: List[str], ngrams2: List[str]) -> float:
else:
return 0.0


def jaccardify(source: str, target: str, n: int) -> float:
truncated_target = target[:50]
source_ngrams = ngramify(source, n)
target_ngrams = ngramify(truncated_target, n)
return jaccard_ngram(source_ngrams, target_ngrams)


def has_source_text(source: str) -> int:
if source.strip():
return 1
else:
return 0


def jaccard_numeric(source: str, target: str) -> float:
source_numerics = re.findall(DIGITS, source)
truncated_target = target[:100]
target_numerics = re.findall(DIGITS, truncated_target)
return jaccard_ngram(source_numerics, target_numerics)


def match_numeric(source: str, target: str) -> float:
source_numerics = re.findall(DIGITS, source)
truncated_target = target[:100]
@@ -90,25 +96,28 @@ def match_numeric(source: str, target: str) -> float:
for number in source_numerics:
found = number in target_numerics
token_found.append(found)

if False not in token_found:
return 1
else:
return 0


def jaccard_alpha(source: str, target: str) -> float:
source_alpha = re.findall(ALPHA, source)
truncated_target = target[:50]
target_alpha = re.findall(ALPHA, truncated_target)
return jaccard_ngram(source_alpha, target_alpha)


# predicts mention/bib entry matches by matching normalized source tokens

# examples returning True:
# source_text = "[78]"
# target_text = "[78]. C. L. Willis and S. L. Miertschin. Mind maps..."
# source_text = "(Wilkinson et al., 2017)"
# target_text = "Wilkinson, R., Quigley, Q., and Marimba, P. Time means nothing. Journal of Far-Fetched Hypotheses, 2017."
# target_text = "Wilkinson, R., Quigley, Q., and Marimba, P. Time means nothing.
# Journal of Far-Fetched Hypotheses, 2017."
#
# examples returning False:
# source_text = "[3]"
@@ -117,11 +126,13 @@ def jaccard_alpha(source: str, target: str) -> float:
# target_text = "Shi, X. Vanilla Ice Cream Is the Best. Epic Assertions, 2021"

# some failure modes: no source text; source text ranges such as "[13-15]";
# incomplete source text such as ", 2019)"; bib entry text with both item and page numbers
# incomplete source text such as ", 2019)"; bib entry text with both item
# and page numbers
def strip_and_tokenize(text: str) -> List[str]:
stripped_text = RELEVANT_PUNCTUATION.sub("", text)
return stripped_text.lower().strip().split()


def match_source_tokens(source: str, target: str) -> float:
if not source:
return 0
@@ -133,7 +144,7 @@ def match_source_tokens(source: str, target: str) -> float:
if token != 'et' and token != 'al' and token != 'and':
found = token in target_tokens
token_found.append(found)

if False not in token_found:
return 1
else:
@@ -152,4 +163,4 @@ def match_first_token(source: str, target: str) -> float:
if first_source_token in target_tokens:
return 1
else:
return 0
return 0
2 changes: 1 addition & 1 deletion mmda/parsers/__init__.py
@@ -2,4 +2,4 @@

__all__ = [
'PDFPlumberParser'
]
]
5 changes: 2 additions & 3 deletions mmda/parsers/grobid_parser.py
@@ -7,10 +7,8 @@
import os
import io
import xml.etree.ElementTree as et
from typing import List, Optional, Text
from typing import List, Optional
import requests
import tempfile
import json

from mmda.parsers.parser import Parser
from mmda.types.annotation import SpanGroup
@@ -54,6 +52,7 @@ def _post_document(url: str, input_pdf_path: str) -> str:

return req.text


class GrobidHeaderParser(Parser):
"""Grobid parser that uses header API methods to get title and abstract only. The
current purpose of this class is evaluation against other methods for title and
2 changes: 1 addition & 1 deletion mmda/parsers/parser.py
@@ -7,7 +7,7 @@
"""

from abc import abstractmethod
from typing import List, Optional, Protocol, Union
from typing import Protocol

from mmda.types.document import Document

22 changes: 5 additions & 17 deletions mmda/parsers/pdfplumber_parser.py
@@ -9,7 +9,7 @@
from mmda.types.annotation import SpanGroup
from mmda.types.document import Document
from mmda.parsers.parser import Parser
from mmda.types.names import *
from mmda.types.names import Symbols, Pages, Tokens, Rows


class PDFPlumberParser(Parser):
@@ -288,7 +288,7 @@ def _simple_line_detection(
Adapted from https://github.com/allenai/VILA/blob/e6d16afbd1832f44a430074855fbb4c3d3604f4a/src/vila/pdftools/pdfplumber_extractor.py#L24

Modified Oct 2022 (kylel): Changed return value to be List[int]
"""
""" # noqa
@soldni (Member) commented on Oct 24, 2022:

#noqa is a blanket ignore; we should use specific error ignores instead.
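
One way to act on that suggestion (a sketch, assuming flake8 is installed from the dev extras): run flake8 against just this file to see which codes it actually reports, then suppress only those codes on the offending line.

```bash
# List the error codes flake8 reports for this file, so the docstring line can
# carry a targeted suppression such as "# noqa: E501" instead of a blanket
# "# noqa". (E501 is only an illustrative code, not necessarily the one
# reported here.)
flake8 mmda/parsers/pdfplumber_parser.py
```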

prev_y = None
prev_x = None

@@ -333,9 +333,9 @@ def _align_coarse_and_fine_tokens(
"""Returns a list of length len(fine_tokens) where elements of the list are
integer indices into coarse_tokens elements."""
assert len(coarse_tokens) <= len(fine_tokens), \
f"This method requires |coarse| <= |fine|"
"This method requires |coarse| <= |fine|"
assert ''.join(coarse_tokens) == ''.join(fine_tokens), \
f"This method requires the chars(coarse) == chars(fine)"
"This method requires the chars(coarse) == chars(fine)"

coarse_start_ends = []
start = 0
@@ -366,19 +366,7 @@ def _align_coarse_and_fine_tokens(
return out









"""





row_annos.append(row)
current_rows_tokens = []

@@ -400,4 +388,4 @@ def _align_coarse_and_fine_tokens(
page_annos.append(page)
current_pages_tokens = []

"""
""" # noqa
37 changes: 20 additions & 17 deletions mmda/parsers/symbol_scraper_parser.py
@@ -6,7 +6,6 @@

"""
import os
import json
import logging
import math
import re
@@ -21,8 +20,7 @@
from mmda.types.annotation import SpanGroup
from mmda.types.document import Document
from mmda.parsers.parser import Parser
from mmda.types.names import *

from mmda.types.names import Symbols, Pages, Tokens, Rows

logger = logging.getLogger(__name__)

@@ -109,7 +107,8 @@ def _find_one_and_extract(self, my_list: List[str],
return None

def _parse_row_head_tag(self, row_tag: str) -> Dict:
# TODO - not sure why line bboxes are useful; skip for now. they dont quite make sense (e.g. bbox[1] == bbox[3])
# TODO - not sure why line bboxes are useful; skip for now. they dont quite make sense
# (e.g. bbox[1] == bbox[3])
match = re.match(pattern=r'<Line id=\"([0-9]+)\" BBOX=\"(.+)\">', string=row_tag)
return {'id': int(match.group(1)), 'bbox': match.group(2)}

@@ -132,14 +131,19 @@ def _parse_page_to_metrics(self, xml_lines: List[str]) -> Dict:
pagemetrics = xml_lines[start:end]

page_to_metrics = {}
for start, end in self._split_list_by_start_end_tags(my_list=pagemetrics, start_tag='<page>', end_tag='</page>'):
for start, end in self._split_list_by_start_end_tags(
my_list=pagemetrics, start_tag='<page>', end_tag='</page>'):
partition = pagemetrics[start:end]
page_num = int(self._find_one_and_extract(my_list=partition, start_tag='<no>', end_tag='</no>'))
page_width = float(self._find_one_and_extract(my_list=partition, start_tag='<pagewidth>', end_tag='</pagewidth>'))
page_height = float(self._find_one_and_extract(my_list=partition, start_tag='<pageheight>', end_tag='</pageheight>'))
page_width = float(self._find_one_and_extract(
my_list=partition, start_tag='<pagewidth>', end_tag='</pagewidth>'))
page_height = float(self._find_one_and_extract(
my_list=partition, start_tag='<pageheight>', end_tag='</pageheight>'))
page_num_rows = int(self._find_one_and_extract(my_list=partition, start_tag='<lines>', end_tag='</lines>'))
page_num_tokens = int(self._find_one_and_extract(my_list=partition, start_tag='<words>', end_tag='</words>'))
page_num_chars = int(self._find_one_and_extract(my_list=partition, start_tag='<characters>', end_tag='</characters>'))
page_num_tokens = int(self._find_one_and_extract(
my_list=partition, start_tag='<words>', end_tag='</words>'))
page_num_chars = int(self._find_one_and_extract(
my_list=partition, start_tag='<characters>', end_tag='</characters>'))
page_to_metrics[page_num] = {
'height': page_height,
'width': page_width,
@@ -163,10 +167,9 @@ def _parse_page_to_row_to_tokens(self, xml_lines: List[str], page_to_metrics: Di
row_info = self._parse_row_head_tag(row_tag=row_lines[0]) # first line is the head tag
row_id = row_info['id']
for token_start, token_end in self._split_list_by_start_end_tags(my_list=row_lines,
start_tag='<Word',
end_tag='</Word>'):
start_tag='<Word',
end_tag='</Word>'):
token_lines = row_lines[token_start:token_end]
token_info = self._parse_token_head_tag(token_tag=token_lines[0]) # first line is the head tag
char_bboxes: List[Box] = []
token = ''
for char_tag in [t for t in token_lines if t.startswith('<Char') and t.endswith('</Char>')]:
@@ -216,7 +219,7 @@ def _convert_nested_text_to_doc_json(self, page_to_row_to_tokens: Dict) -> Dict:
if k < len(tokens) - 1:
text += ' '
else:
text += '\n' # start newline at end of row
text += '\n' # start newline at end of row
start = end + 1
# make row
row = SpanGroup(spans=[
@@ -265,8 +268,10 @@ def _parse_xml_to_doc(self, xmlfile: str) -> Document:
# get page metrics
page_to_metrics = self._parse_page_to_metrics(xml_lines=xml_lines)
logger.info(f'\tNum pages: {len(page_to_metrics)}')
logger.info(f"\tAvg tokens: {sum([metric['tokens'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(f"\tAvg rows: {sum([metric['rows'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(
f"\tAvg tokens: {sum([metric['tokens'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(
f"\tAvg rows: {sum([metric['rows'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")

# get token stream (grouped by page & row)
page_to_row_to_tokens = self._parse_page_to_row_to_tokens(xml_lines=xml_lines, page_to_metrics=page_to_metrics)
@@ -277,5 +282,3 @@ def _parse_xml_to_doc(self, xmlfile: str) -> Document:
# build Document
doc = Document.from_json(doc_dict=doc_dict)
return doc


@@ -2,4 +2,4 @@


class BaseHeuristicPredictor(BasePredictor):
REQUIRED_BACKENDS = []
REQUIRED_BACKENDS = []