Skip to content

Commit 3a8afa3

Browse files
Merge pull request #6 from wellcometrust/feature/ivyleavedtoadflax/add_parsing
Prepare package for training parsing models
2 parents dc40e8d + 521cfeb commit 3a8afa3

File tree

5 files changed (+21 −12 lines changed)

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -9,4 +9,5 @@ deep_reference_parser/models/
 *.whl
 embeddings/
 models/
-.tox/
+.tox/
+*__pycache__/

Makefile

Lines changed: 7 additions & 4 deletions
@@ -78,9 +78,12 @@ $(artefacts):
 models: $(artefacts)


-datasets = data/2019.12.0_train.tsv \
-	data/2019.12.0_test.tsv \
-	data/2019.12.0_valid.tsv
+datasets = data/splitting/2019.12.0_splitting_train.tsv \
+	data/splitting/2019.12.0_splitting_test.tsv \
+	data/splitting/2019.12.0_splitting_valid.tsv \
+	data/splitting/2020.2.0_parsing_train.tsv \
+	data/splitting/2020.2.0_parsing_test.tsv \
+	data/splitting/2020.2.0_parsing_valid.tsv


 rodrigues_datasets = data/rodrigues/clean_train.txt \
@@ -90,7 +93,7 @@ rodrigues_datasets = data/rodrigues/clean_train.txt \
 RODRIGUES_DATA_URL = https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing/raw/master/dataset/

 $(datasets):
-	@ mkdir -p data
+	@ mkdir -p $(@D)
 	curl -s $(S3_BUCKET_HTTP)/$@ --output $@

 $(rodrigues_datasets):

deep_reference_parser/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@
 __author__ = "Wellcome Trust DataLabs Team"
 __author_email__ = "[email protected]"
 __license__ = "MIT"
-__model_version__ = "2019.12.0"
+__model_version__ = "2019.12.0_splitting"

deep_reference_parser/configs/2019.12.0.ini renamed to deep_reference_parser/configs/2019.12.0_splitting.ini

Lines changed: 5 additions & 5 deletions
@@ -1,5 +1,5 @@
 [DEFAULT]
-version = 2019.12.0
+version = 2019.12.0_splitting

 [data]
 test_proportion = 0.25
@@ -8,13 +8,13 @@ data_path = data/
 respect_line_endings = 0
 respect_doc_endings = 1
 line_limit = 250
-policy_train = data/2019.12.0_train.tsv
-policy_test = data/2019.12.0_test.tsv
-policy_valid = data/2019.12.0_valid.tsv
+policy_train = data/splitting/2019.12.0_splitting_train.tsv
+policy_test = data/splitting/2019.12.0_splitting_test.tsv
+policy_valid = data/splitting/2019.12.0_splitting_valid.tsv
 s3_slug = https://datalabs-public.s3.eu-west-2.amazonaws.com/deep_reference_parser/

 [build]
-output_path = models/2019.12.0/
+output_path = models/splitting/2019.12.0_splitting/
 output = crf
 word_embeddings = embeddings/2020.1.1-wellcome-embeddings-300.txt
 pretrained_embedding = 0

deep_reference_parser/prodigy/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -1,2 +1,7 @@
+from .numbered_reference_annotator import (NumberedReferenceAnnotator,
+                                           annotate_numbered_references)
+from .prodigy_to_tsv import TokenLabelPairs, prodigy_to_tsv
+from .reach_to_prodigy import ReachToProdigy, reach_to_prodigy
+from .reference_to_token_annotations import (TokenTagger,
+                                             reference_to_token_annotations)
 from .spacy_doc_to_prodigy import SpacyDocToProdigy
-from .reference_to_token_annotations import TokenTagger

0 commit comments

Comments (0)