Egork/flake8 #171

Open
egork520 wants to merge 34 commits into main from egork/flake8
Commits (34)
4f4c5c6
Moving vila test to a class so that pytest ./test works locally
egork520 Oct 19, 2022
ff18984
Added notes on how to run unit tests locally
egork520 Oct 19, 2022
7f7bd66
Moving change of directory to the setUp class. Locally tests fail in…
egork520 Oct 19, 2022
747ace4
Moving change of directory to the setUp class. Locally tests fail in…
egork520 Oct 20, 2022
a425583
Creating variable for the fixtures path
egork520 Oct 20, 2022
38b76b3
Adding pytest.ini, no need to type the folder for the tests
egork520 Oct 20, 2022
425b072
Adding exact command for selecting tests, removing directory name, we…
egork520 Oct 20, 2022
5c1ec12
Adding test install dependencies, for running tests on multiple cpus,…
egork520 Oct 20, 2022
062f953
Command for running tests on multiple cpus
egork520 Oct 20, 2022
55689ae
Adding test requirements installation to the mmda-ci.yml
egork520 Oct 20, 2022
9e1bf61
Adding plugin for running coverage more easily, removing coverage
egork520 Oct 21, 2022
6fbf179
Adding coverage config file,
egork520 Oct 21, 2022
4035404
Adding coverage lower bound 57%, specifying module for which to measu…
egork520 Oct 21, 2022
9d9d4d7
Removing individual config files in favor of setup.cfg file
egork520 Oct 21, 2022
7739963
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
d0f7117
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
faded77
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
ac54653
Update tests/test_predictors/test_vila_predictors.py
egork520 Oct 21, 2022
c1b97fd
Update tests/test_predictors/test_figure_table_predictors.py
egork520 Oct 21, 2022
cd06980
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
fb00985
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
69e1c7b
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
55cd37d
Update tests/test_parsers/test_pdf_plumber_parser.py
egork520 Oct 21, 2022
ffa7b56
Rolling in test dependencies into dev
egork520 Oct 21, 2022
c26c46b
Fixing typos
egork520 Oct 21, 2022
5e4b09f
Adding parameters for the coverage
egork520 Oct 21, 2022
45eff40
Specifying percentage of the coverage in the builds
egork520 Oct 21, 2022
641b542
Updating comments about pytest
egork520 Oct 21, 2022
cf0436b
Merge branch 'main' of github.com:allenai/mmda into egork/flake8
egork520 Oct 22, 2022
6ba65aa
First part of lint fixing
egork520 Oct 24, 2022
690c690
Second part of lint fixing
egork520 Oct 24, 2022
7ad0d57
Skip old code
egork520 Oct 24, 2022
0a9786a
Adding flake8 run to the compile step
egork520 Oct 24, 2022
46961d6
Adding a note on how to run the flake8 test
egork520 Oct 24, 2022
1 change: 1 addition & 0 deletions .github/workflows/mmda-ci.yml
@@ -24,6 +24,7 @@ jobs:
- name: Test with Python ${{ matrix.python-version }}
run: |
pip install -e .[dev,pysbd_predictors,hf_predictors]
flake8
pytest --cov-fail-under=42 --ignore=tests/test_predictors/test_vila_predictors.py --ignore=tests/test_predictors/test_figure_table_predictors.py

test_vila_predictors:
9 changes: 9 additions & 0 deletions README.md
@@ -9,6 +9,15 @@ conda create -n mmda python=3.8
pip install -e '.[dev,<extras_require section from setup.py>]'
```

## PEP 8 - Style guide for Python code
https://peps.python.org/pep-0008/

We propose to follow the PEP 8 style guide. To check the code, run:

```bash
flake8
```
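
For quicker iteration, flake8 can also be scoped to part of the tree (a sketch, assuming the flake8 settings live in setup.cfg as the commit history suggests; `mmda` and `tests` are the package and test directories of this repo):

```bash
# Check only the package and test directories, and print a per-code summary
# of how many violations were found.
flake8 mmda tests --count --statistics
```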

## Unit testing
Note that pytest runs coverage, which measures the unit-test coverage of the code.
The required coverage percentage is configured in the setup.cfg file.
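
To reproduce the CI check locally (a sketch; the coverage threshold comes from setup.cfg, and the parallel run assumes pytest-xdist is among the test dependencies mentioned in the commit history):

```bash
# Run the unit tests; coverage settings and pytest defaults are read from setup.cfg.
pytest

# Spread the tests across all available CPUs (requires pytest-xdist).
pytest -n auto
```
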
2 changes: 1 addition & 1 deletion mmda/eval/vlue.py
@@ -78,7 +78,7 @@ def read_labels(labels_json_path: str) -> list[LabeledDoc]:
list[LabeledDoc]: List of labeled documents
"""
with open(labels_json_path, encoding="utf-8") as f:
labels = [LabeledDoc(**l) for l in json.loads(f.read())]
labels = [LabeledDoc(**line) for line in json.loads(f.read())]

return labels

33 changes: 22 additions & 11 deletions mmda/featurizers/citation_link_featurizers.py
@@ -1,12 +1,11 @@
import pandas as pd
from pydantic import BaseModel
import re
from typing import List, Dict

import pandas as pd
from thefuzz import fuzz
from typing import List, Tuple, Dict

from mmda.types.annotation import SpanGroup


DIGITS = re.compile(r'[0-9]+')
ALPHA = re.compile(r'[A-Za-z]+')
RELEVANT_PUNCTUATION = re.compile(r"\(|\)|\[|,|\]|\.|&|\;")
@@ -23,6 +22,7 @@
JACCARD_ALPHA = "jaccard_alpha"
MATCH_FIRST_TOKEN = "match_first_token"


class CitationLink:
def __init__(self, mention: SpanGroup, bib: SpanGroup):
self.mention = mention
@@ -31,6 +31,7 @@ def __init__(self, mention: SpanGroup, bib: SpanGroup):
def to_text_dict(self) -> Dict[str, str]:
return {"source_text": "".join(self.mention.symbols), "target_text": "".join(self.bib.symbols)}


def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
# create dataframe
df = pd.DataFrame.from_records([link.to_text_dict() for link in possible_links])
@@ -46,15 +47,16 @@ def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
df[MATCH_NUMERIC] = df.apply(lambda row: match_numeric(row['source_text'], row['target_text']), axis=1)
df[JACCARD_ALPHA] = df.apply(lambda row: jaccard_alpha(row['source_text'], row['target_text']), axis=1)
df[MATCH_FIRST_TOKEN] = df.apply(lambda row: match_first_token(row['source_text'], row['target_text']), axis=1)

# drop text columns
X_features = df.drop(columns=['source_text', 'target_text'])
return X_features


def ngramify(s: str, n: int) -> List[str]:
s_len = len(s)
return [s[i:i+n] for i in range(s_len-n+1)]
return [s[i:i + n] for i in range(s_len - n + 1)]


def jaccard_ngram(ngrams1: List[str], ngrams2: List[str]) -> float:
if ngrams1 or ngrams2:
@@ -64,24 +66,28 @@ def jaccard_ngram(ngrams1: List[str], ngrams2: List[str]) -> float:
else:
return 0.0


def jaccardify(source: str, target: str, n: int) -> float:
truncated_target = target[:50]
source_ngrams = ngramify(source, n)
target_ngrams = ngramify(truncated_target, n)
return jaccard_ngram(source_ngrams, target_ngrams)


def has_source_text(source: str) -> int:
if source.strip():
return 1
else:
return 0


def jaccard_numeric(source: str, target: str) -> float:
source_numerics = re.findall(DIGITS, source)
truncated_target = target[:100]
target_numerics = re.findall(DIGITS, truncated_target)
return jaccard_ngram(source_numerics, target_numerics)


def match_numeric(source: str, target: str) -> float:
source_numerics = re.findall(DIGITS, source)
truncated_target = target[:100]
@@ -90,25 +96,28 @@ def match_numeric(source: str, target: str) -> float:
for number in source_numerics:
found = number in target_numerics
token_found.append(found)

if False not in token_found:
return 1
else:
return 0


def jaccard_alpha(source: str, target: str) -> float:
source_alpha = re.findall(ALPHA, source)
truncated_target = target[:50]
target_alpha = re.findall(ALPHA, truncated_target)
return jaccard_ngram(source_alpha, target_alpha)


# predicts mention/bib entry matches by matching normalized source tokens

# examples returning True:
# source_text = "[78]"
# target_text = "[78]. C. L. Willis and S. L. Miertschin. Mind maps..."
# source_text = "(Wilkinson et al., 2017)"
# target_text = "Wilkinson, R., Quigley, Q., and Marimba, P. Time means nothing. Journal of Far-Fetched Hypotheses, 2017."
# target_text = "Wilkinson, R., Quigley, Q., and Marimba, P. Time means nothing.
# Journal of Far-Fetched Hypotheses, 2017."
#
# examples returning False:
# source_text = "[3]"
@@ -117,11 +126,13 @@ def jaccard_alpha(source: str, target: str) -> float:
# target_text = "Shi, X. Vanilla Ice Cream Is the Best. Epic Assertions, 2021"

# some failure modes: no source text; source text ranges such as "[13-15]";
# incomplete source text such as ", 2019)"; bib entry text with both item and page numbers
# incomplete source text such as ", 2019)"; bib entry text with both item
# and page numbers
def strip_and_tokenize(text: str) -> List[str]:
stripped_text = RELEVANT_PUNCTUATION.sub("", text)
return stripped_text.lower().strip().split()


def match_source_tokens(source: str, target: str) -> float:
if not source:
return 0
@@ -133,7 +144,7 @@ def match_source_tokens(source: str, target: str) -> float:
if token != 'et' and token != 'al' and token != 'and':
found = token in target_tokens
token_found.append(found)

if False not in token_found:
return 1
else:
@@ -152,4 +163,4 @@ def match_first_token(source: str, target: str) -> float:
if first_source_token in target_tokens:
return 1
else:
return 0
return 0
2 changes: 1 addition & 1 deletion mmda/parsers/__init__.py
@@ -2,4 +2,4 @@

__all__ = [
'PDFPlumberParser'
]
]
5 changes: 2 additions & 3 deletions mmda/parsers/grobid_parser.py
@@ -7,10 +7,8 @@
import os
import io
import xml.etree.ElementTree as et
from typing import List, Optional, Text
from typing import List, Optional
import requests
import tempfile
import json

from mmda.parsers.parser import Parser
from mmda.types.annotation import SpanGroup
@@ -54,6 +52,7 @@ def _post_document(url: str, input_pdf_path: str) -> str:

return req.text


class GrobidHeaderParser(Parser):
"""Grobid parser that uses header API methods to get title and abstract only. The
current purpose of this class is evaluation against other methods for title and
2 changes: 1 addition & 1 deletion mmda/parsers/parser.py
@@ -7,7 +7,7 @@
"""

from abc import abstractmethod
from typing import List, Optional, Protocol, Union
from typing import Protocol

from mmda.types.document import Document

22 changes: 5 additions & 17 deletions mmda/parsers/pdfplumber_parser.py
@@ -9,7 +9,7 @@
from mmda.types.annotation import SpanGroup
from mmda.types.document import Document
from mmda.parsers.parser import Parser
from mmda.types.names import *
from mmda.types.names import Symbols, Pages, Tokens, Rows


class PDFPlumberParser(Parser):
@@ -288,7 +288,7 @@ def _simple_line_detection(
Adapted from https://github.com/allenai/VILA/blob/e6d16afbd1832f44a430074855fbb4c3d3604f4a/src/vila/pdftools/pdfplumber_extractor.py#L24

Modified Oct 2022 (kylel): Changed return value to be List[int]
"""
""" # noqa
@soldni (Member) commented on Oct 24, 2022:

#noqa is a blanket ignore; we should use specific error ignores instead.
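
One way to act on that suggestion (a sketch, assuming flake8 is installed from the dev extras): run flake8 against just this file to see which codes it actually reports, then suppress only those codes on the offending line.

```bash
# List the error codes flake8 reports for this file, so the docstring line can
# carry a targeted suppression such as "# noqa: E501" instead of a blanket
# "# noqa". (E501 is only an illustrative code, not necessarily the one
# reported here.)
flake8 mmda/parsers/pdfplumber_parser.py
```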

prev_y = None
prev_x = None

@@ -333,9 +333,9 @@ def _align_coarse_and_fine_tokens(
"""Returns a list of length len(fine_tokens) where elements of the list are
integer indices into coarse_tokens elements."""
assert len(coarse_tokens) <= len(fine_tokens), \
f"This method requires |coarse| <= |fine|"
"This method requires |coarse| <= |fine|"
assert ''.join(coarse_tokens) == ''.join(fine_tokens), \
f"This method requires the chars(coarse) == chars(fine)"
"This method requires the chars(coarse) == chars(fine)"

coarse_start_ends = []
start = 0
@@ -366,19 +366,7 @@ def _align_coarse_and_fine_tokens(
return out









"""





row_annos.append(row)
current_rows_tokens = []

@@ -400,4 +388,4 @@ def _align_coarse_and_fine_tokens(
page_annos.append(page)
current_pages_tokens = []

"""
""" # noqa
37 changes: 20 additions & 17 deletions mmda/parsers/symbol_scraper_parser.py
@@ -6,7 +6,6 @@

"""
import os
import json
import logging
import math
import re
@@ -21,8 +20,7 @@
from mmda.types.annotation import SpanGroup
from mmda.types.document import Document
from mmda.parsers.parser import Parser
from mmda.types.names import *

from mmda.types.names import Symbols, Pages, Tokens, Rows

logger = logging.getLogger(__name__)

@@ -109,7 +107,8 @@ def _find_one_and_extract(self, my_list: List[str],
return None

def _parse_row_head_tag(self, row_tag: str) -> Dict:
# TODO - not sure why line bboxes are useful; skip for now. they dont quite make sense (e.g. bbox[1] == bbox[3])
# TODO - not sure why line bboxes are useful; skip for now. they dont quite make sense
# (e.g. bbox[1] == bbox[3])
match = re.match(pattern=r'<Line id=\"([0-9]+)\" BBOX=\"(.+)\">', string=row_tag)
return {'id': int(match.group(1)), 'bbox': match.group(2)}

@@ -132,14 +131,19 @@ def _parse_page_to_metrics(self, xml_lines: List[str]) -> Dict:
pagemetrics = xml_lines[start:end]

page_to_metrics = {}
for start, end in self._split_list_by_start_end_tags(my_list=pagemetrics, start_tag='<page>', end_tag='</page>'):
for start, end in self._split_list_by_start_end_tags(
my_list=pagemetrics, start_tag='<page>', end_tag='</page>'):
partition = pagemetrics[start:end]
page_num = int(self._find_one_and_extract(my_list=partition, start_tag='<no>', end_tag='</no>'))
page_width = float(self._find_one_and_extract(my_list=partition, start_tag='<pagewidth>', end_tag='</pagewidth>'))
page_height = float(self._find_one_and_extract(my_list=partition, start_tag='<pageheight>', end_tag='</pageheight>'))
page_width = float(self._find_one_and_extract(
my_list=partition, start_tag='<pagewidth>', end_tag='</pagewidth>'))
page_height = float(self._find_one_and_extract(
my_list=partition, start_tag='<pageheight>', end_tag='</pageheight>'))
page_num_rows = int(self._find_one_and_extract(my_list=partition, start_tag='<lines>', end_tag='</lines>'))
page_num_tokens = int(self._find_one_and_extract(my_list=partition, start_tag='<words>', end_tag='</words>'))
page_num_chars = int(self._find_one_and_extract(my_list=partition, start_tag='<characters>', end_tag='</characters>'))
page_num_tokens = int(self._find_one_and_extract(
my_list=partition, start_tag='<words>', end_tag='</words>'))
page_num_chars = int(self._find_one_and_extract(
my_list=partition, start_tag='<characters>', end_tag='</characters>'))
page_to_metrics[page_num] = {
'height': page_height,
'width': page_width,
@@ -163,10 +167,9 @@ def _parse_page_to_row_to_tokens(self, xml_lines: List[str], page_to_metrics: Di
row_info = self._parse_row_head_tag(row_tag=row_lines[0]) # first line is the head tag
row_id = row_info['id']
for token_start, token_end in self._split_list_by_start_end_tags(my_list=row_lines,
start_tag='<Word',
end_tag='</Word>'):
start_tag='<Word',
end_tag='</Word>'):
token_lines = row_lines[token_start:token_end]
token_info = self._parse_token_head_tag(token_tag=token_lines[0]) # first line is the head tag
char_bboxes: List[Box] = []
token = ''
for char_tag in [t for t in token_lines if t.startswith('<Char') and t.endswith('</Char>')]:
@@ -216,7 +219,7 @@ def _convert_nested_text_to_doc_json(self, page_to_row_to_tokens: Dict) -> Dict:
if k < len(tokens) - 1:
text += ' '
else:
text += '\n' # start newline at end of row
text += '\n' # start newline at end of row
start = end + 1
# make row
row = SpanGroup(spans=[
@@ -265,8 +268,10 @@ def _parse_xml_to_doc(self, xmlfile: str) -> Document:
# get page metrics
page_to_metrics = self._parse_page_to_metrics(xml_lines=xml_lines)
logger.info(f'\tNum pages: {len(page_to_metrics)}')
logger.info(f"\tAvg tokens: {sum([metric['tokens'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(f"\tAvg rows: {sum([metric['rows'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(
f"\tAvg tokens: {sum([metric['tokens'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")
logger.info(
f"\tAvg rows: {sum([metric['rows'] for metric in page_to_metrics.values()]) / len(page_to_metrics)}")

# get token stream (grouped by page & row)
page_to_row_to_tokens = self._parse_page_to_row_to_tokens(xml_lines=xml_lines, page_to_metrics=page_to_metrics)
@@ -277,5 +282,3 @@ def _parse_xml_to_doc(self, xmlfile: str) -> Document:
# build Document
doc = Document.from_json(doc_dict=doc_dict)
return doc


@@ -2,4 +2,4 @@


class BaseHeuristicPredictor(BasePredictor):
REQUIRED_BACKENDS = []
REQUIRED_BACKENDS = []