-
Notifications
You must be signed in to change notification settings - Fork 1
/
project.yml
175 lines (156 loc) · 6.53 KB
/
project.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Human-readable project metadata.
title: 'NER Citations of ECFR Banking Regulation in a spaCy pipeline.'
description: >-
  Custom NER project for spaCy v3 adapted from the spaCy v3
  [`ner_demo`](https://github.com/explosion/projects/tree/9d5fce5f95ddf5f35c3370b2074b25e995525f51/pipelines/ner_demo)
  example script for creating an NER component in a new pipeline.
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  lang: 'en'
  # Corpus files written by "data-to-spacy" and read by "train" / "evaluate".
  train: 'corpus/train.spacy'
  dev: 'corpus/dev.spacy'
  version: '0.1.0'
  # Set your GPU ID, -1 is CPU
  gpu_id: -1
  # Vectors model for train-with-vectors
  vectors_model: 'en_core_web_lg'
  name: 'ecfr_ner'
  # Prodigy dataset names, referenced as ${vars.prodigy.<key>}.
  prodigy:
    ner_labels: 'ecfr_initial_ner'
    ner_manual_labels: 'ecfr_manual_ner'
    senter_labels: 'ecfr_labeled_sents'
    ner_labeled_dataset: 'ecfr_labeled_ner'
  # Annotation and pattern files, referenced as ${vars.assets.<key>}.
  assets:
    ner_labels: 'assets/ecfr_ner_labels.jsonl'
    senter_labels: 'assets/ecfr_senter_labels.jsonl'
    ner_patterns: 'assets/patterns.jsonl'
  # Sentence-level source files, referenced as ${vars.raw_files.<key>}.
  raw_files:
    ner_ecfr_raw: 'assets/raw-files/ecfr-title-12-sent.jsonl'
    ner_ecfr_shuffle: 'assets/raw-files/ecfr-title-12-sent-shuffle.jsonl'
# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
# "packages" is included because the "package" command writes its build into
# that directory and the "clean" command removes it.
directories: ["assets", "corpus", "configs", "training", "scripts", "packages"]
# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded.
# Every entry is a "dest" path paired with its own "description".
assets:
- dest: 'assets/ecfr_ner_labels.jsonl'
  description: '400 initial NER labels of sections, cites, and laws'
- dest: 'assets/patterns.jsonl'
  description: 'Patterns for sections, cites, and laws for initial NER training'
- dest: 'assets/ecfr_senter_labels.jsonl'
  description: '150 initial sentence segmentations of eCFR sub-sections'
- dest: 'assets/raw-files/ecfr-sample-sents.jsonl'
  description: 'Sample of Prodigy annotated sentences from ecfr-sample-title-12.jsonl file'
- dest: 'assets/raw-files/ecfr-sample-title-12.jsonl'
  description: 'Sample of 47 records (sub-sections) from ecfr-title-12.jsonl'
- dest: 'assets/raw-files/ecfr-title-12.jsonl'
  description: 'eCFR Title 12 (Banking) parsed as a jsonl file'
- dest: 'assets/raw-files/ecfr-title-12-sent.jsonl'
  description: 'Senter scored model segmenting ecfr-title-12.jsonl'
# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
# NOTE: "all" does not include "data-to-spacy", so the corpus files that
# "train" lists as deps must already exist before this workflow is run.
workflows:
  all:
  - download
  - train
  - evaluate
  - package
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
# Fetches the vectors package named by vars.vectors_model.
- name: 'download'
  help: 'Download a spaCy model with pretrained vectors'
  script:
  - 'python -m spacy download ${vars.vectors_model}'
# Converts the Prodigy senter and NER datasets into the binary corpus files
# under corpus/ that "train" and "evaluate" list as deps.
- name: 'data-to-spacy'
  help: "Merge your annotations and create data in spaCy's binary format"
  script:
  - >-
    python -m prodigy data-to-spacy corpus/
    --senter ${vars.prodigy.senter_labels}
    --ner ${vars.prodigy.ner_labeled_dataset}
  # Declared through the shared vars so the corpus paths are defined in
  # exactly one place (vars.train / vars.dev).
  outputs:
  - '${vars.train}'
  - '${vars.dev}'
# Dumps the Prodigy sentence-segmentation dataset so it can be kept with the
# project assets.
# NOTE(review): Prodigy documents the second db-out argument as an output
# directory; confirm that passing a .jsonl file path produces the intended file.
- name: 'data-to-asset-senter'
  help: 'Export senter annotations to assets'
  script:
  - 'python -m prodigy db-out ${vars.prodigy.senter_labels} ${vars.assets.senter_labels}'
  outputs:
  - 'assets'
# Runs Prodigy's train-curve recipe against the labelled NER dataset.
- name: 'train-curve-ner'
  help: 'Train curve for NER'
  script:
  - 'python -m prodigy train-curve --ner ${vars.prodigy.ner_labeled_dataset}'
# Dumps the initial NER dataset (vars.prodigy.ner_labels) so it can be kept
# with the project assets.
# NOTE(review): Prodigy documents the second db-out argument as an output
# directory; confirm that passing a .jsonl file path produces the intended file.
- name: 'data-to-asset-ner'
  help: 'Export NER annotations to assets'
  script:
  - 'python -m prodigy db-out ${vars.prodigy.ner_labels} ${vars.assets.ner_labels}'
  outputs:
  - 'assets'
# Trains the pipeline from configs/config.cfg with static vectors enabled and
# writes the result under training/.
- name: 'train'
  help: 'Train pipeline models'
  script:
  # The corpus paths are passed explicitly so the files listed in "deps" are
  # the ones training actually reads, whatever configs/config.cfg contains.
  - >-
    python -m spacy train configs/config.cfg
    --output training/
    --paths.train ${vars.train}
    --paths.dev ${vars.dev}
    --gpu-id ${vars.gpu_id}
    --initialize.vectors ${vars.vectors_model}
    --components.tok2vec.model.embed.include_static_vectors true
  deps:
  - 'configs/config.cfg'
  - '${vars.train}'
  - '${vars.dev}'
  outputs:
  - 'training/model-best'
# Scores training/model-best on the dev corpus and writes the metrics as JSON.
- name: 'evaluate'
  help: 'Evaluate the model and export metrics'
  script:
  - 'python -m spacy evaluate training/model-best ${vars.dev} --output training/metrics.json'
  deps:
  # Same variable as in the script, so the tracked file and the evaluated
  # file cannot drift apart.
  - '${vars.dev}'
  - 'training/model-best'
  outputs:
  - 'training/metrics.json'
# Starts a Prodigy ner.teach session with the best trained model on the
# shuffled sentence file, saving answers to vars.prodigy.ner_labeled_dataset.
- name: 'prodigy-al-ner'
  help: 'NER prodigy active learning annotations'
  script:
  - >-
    python -m prodigy ner.teach ${vars.prodigy.ner_labeled_dataset}
    training/model-best ${vars.raw_files.ner_ecfr_shuffle}
    --label SECTION,LAW,CITE,PARAGRAPH,TITLE,SUBTITLE
    --patterns ${vars.assets.ner_patterns}
  deps:
  - 'training/model-best'
# Starts a Prodigy ner.manual session on the shuffled sentence file, saving
# answers to vars.prodigy.ner_manual_labels.
- name: 'prodigy-manual-ner'
  help: 'NER prodigy manual learning annotations'
  script:
  - >-
    python -m prodigy ner.manual ${vars.prodigy.ner_manual_labels}
    training/model-best ${vars.raw_files.ner_ecfr_shuffle}
    --label SECTION,LAW,CITE,PARAGRAPH,TITLE,SUBTITLE
    --patterns ${vars.assets.ner_patterns}
  deps:
  - 'training/model-best'
# Disabled: shuffles the sentence file before it is fed to Prodigy.
# - name: "shuffle"
#   help: "Shuffle jsonl files for Prodigy"
#   script:
#   - "python scripts/shuffle.py ${vars.raw_files.ner_ecfr_raw} ${vars.raw_files.ner_ecfr_shuffle}"
#   deps:
#   - "${vars.raw_files.ner_ecfr_raw}"
#   outputs:
#   - "${vars.raw_files.ner_ecfr_shuffle}"
# Builds a package from training/model-best into the packages/ directory.
- name: 'package'
  help: 'Package the trained model as a pip package'
  script:
  - >-
    python -m spacy package training/model-best packages
    --name ${vars.name}
    --version ${vars.version}
    --force
  deps:
  - 'training/model-best'
  outputs_no_cache:
  - 'packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz'
# Launches the Streamlit script in scripts/ against training/model-best.
- name: 'visualize-model'
  help: "Visualize the model's output interactively using Streamlit"
  script:
  - 'streamlit run scripts/visualize_model.py training/model-best'
  deps:
  - 'scripts/visualize_model.py'
  - 'training/model-best'
# Installs the Python requirements listed in requirements.txt.
- name: 'setup'
  help: 'Install dependencies'
  script:
  - 'python -m pip install -r requirements.txt'
  deps:
  - 'requirements.txt'
# Deletes the generated training/, corpus/ and packages/ directories.
- name: 'clean'
  help: 'Remove intermediate files'
  script:
  - 'rm -rf training'
  - 'rm -rf corpus'
  - 'rm -rf packages'
# Regenerates README.md from this project file.
- name: 'document'
  help: 'Export README for project details'
  script:
  # Invoked through "python -m" like the other spaCy commands in this file, so
  # the spaCy installed for the active interpreter is the one that runs.
  - 'python -m spacy project document --output README.md'