diff --git a/.github/ISSUE_TEMPLATE/general-report.md b/.github/ISSUE_TEMPLATE/general-report.md deleted file mode 100644 index 85ac8935b5..0000000000 --- a/.github/ISSUE_TEMPLATE/general-report.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -name: General report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' ---- - -Before creating a new issue, make sure you had a look at the [official documentation](https://grobid.readthedocs.com). For specific questions, you can try the [Mendable Q/A chat](https://www.mendable.ai/demo/723cfc12-fdd6-4631-9a9e-21b80241131b) (**NOTE**: This is rather experimental, if not sure, make sure you double-check using the official documentation.) - -- What is your OS and architecture? Windows is not supported and Mac OS arm64 is experimentally supported. For non-supported OS, you can use Docker (https://grobid.readthedocs.io/en/latest/Grobid-docker/) - -- What is your Java version (`java --version`)? - -- In case of build or run errors, please submit the error while running gradlew with ``--stacktrace`` and ``--info`` for better log traces (e.g. `./gradlew run --stacktrace --info`) or attach the log file `logs/grobid-service.log` or the console log. diff --git a/.github/ISSUE_TEMPLATE/general-report.yml b/.github/ISSUE_TEMPLATE/general-report.yml new file mode 100644 index 0000000000..d12ad8fdd6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/general-report.yml @@ -0,0 +1,36 @@ +name: General report +description: Create a report to help us improve +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! Before creating a new issue, make sure you had a look at the [official documentation](https://grobid.readthedocs.com) or with the **experimental** [Mendable Q/A chat](https://www.mendable.ai/demo/723cfc12-fdd6-4631-9a9e-21b80241131b). **NOTE**: the suggested method of running grobid is through Docker (https://grobid.readthedocs.io/en/latest/Grobid-docker/). + - type: input + id: os + attributes: + label: Operating System and architecture (arm64, amd64, x86, etc.) + description: Please remember that Windows is not supported and Mac OS arm64 is still experimental. + validations: + required: false + - type: input + id: java + attributes: + label: What is your Java version + description: "java --version" + validations: + required: false + - type: textarea + id: logs + attributes: + label: Log and information + description: In case of build or run errors, please submit the error while running gradlew with ``--stacktrace`` and ``--info`` for better log traces (e.g. `./gradlew run --stacktrace --info`) or attach the log file `logs/grobid-service.log` or the console log. + validations: + required: false + - type: textarea + id: what-happened + attributes: + label: Further information + description: Please give us any information that could be of help + validations: + required: false + diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index 6606f0bfd1..d794f4d663 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -3,10 +3,11 @@ name: Build and push a CRF-only docker image on: workflow_dispatch: inputs: - suffix: + custom_tag: type: string - description: Docker image suffix (e.g. 
develop, crf, full) - required: false + description: Docker image tag + required: true + default: "latest-crf" jobs: build: @@ -42,6 +43,6 @@ jobs: registry: docker.io pushImage: true tags: | - latest-develop, latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} + latest-develop, ${{ github.event.inputs.custom_tag}} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-build-manual-full.yml b/.github/workflows/ci-build-manual-full.yml index ce1a0a175b..9ba70d16dc 100644 --- a/.github/workflows/ci-build-manual-full.yml +++ b/.github/workflows/ci-build-manual-full.yml @@ -1,7 +1,13 @@ name: Build and push a full docker image -on: "workflow_dispatch" - +on: + workflow_dispatch: + inputs: + custom_tag: + type: string + description: Docker image tag + required: true + default: "latest-full" jobs: build: @@ -35,7 +41,7 @@ jobs: image: lfoppiano/grobid registry: docker.io pushImage: true - tags: latest-full + tags: latest-full, ${{ github.event.inputs.custom_tag}} dockerfile: Dockerfile.delft - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index 808931f3d7..558933a0e9 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -4,7 +4,7 @@ on: [ push ] concurrency: group: gradle - cancel-in-progress: true + cancel-in-progress: false jobs: diff --git a/.gitignore b/.gitignore index 4bcec65bcd..35526b5247 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ Thumbs.db .settings .classpath .idea +.vscode .gradle **/build */out/ diff --git a/CHANGELOG.md b/CHANGELOG.md index e5df60bb81..bbf5ba3459 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.8.2] - TBD + +### Added +- New model specialisation/variants (flavors) mechanism #1151 +- New specialised models for a lightweight processing that covers other type of scientific articles that are not following the general segmentation schema (e.g. corrections, editorial letters, etc.) 
#1202 +- Additional training data covering edge cases where the Data Availability statements are over multiple pages #1200 +- Added a flag that allow output the raw copyright information in TEI #1181 + +### Changed + +### Fixed +- Fix URL identification for certain edge cases #1190, #1191, #1185 +- Fix fulltext model training data #1107 +- Fix header model training data #1128 +- Updated the docker image's packages to reduce the vulnerabilities #1173 + ## [0.8.1] - 2024-09-14 ### Added diff --git a/Dockerfile.delft b/Dockerfile.delft index ab96ac3e09..d9c0f27f2f 100644 --- a/Dockerfile.delft +++ b/Dockerfile.delft @@ -87,6 +87,7 @@ ENTRYPOINT ["/tini", "-s", "--"] # install JRE, python and other dependencies RUN apt-get update && \ + apt-mark hold libcudnn8 && \ apt-get -y upgrade && \ apt-get -y --no-install-recommends install apt-utils build-essential gcc libxml2 libfontconfig unzip curl \ openjdk-17-jre-headless ca-certificates-java \ @@ -141,7 +142,7 @@ RUN python3 preload_embeddings.py --registry ./resources-registry.json && \ RUN mkdir delft && \ cp ./resources-registry.json delft/ -ENV GROBID_SERVICE_OPTS "--add-opens java.base/java.lang=ALL-UNNAMED" +ENV GROBID_SERVICE_OPTS "--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED" CMD ["./grobid-service/bin/grobid-service"] diff --git a/Readme.md b/Readme.md index 66b4dd6791..989bc5c3ca 100644 --- a/Readme.md +++ b/Readme.md @@ -78,7 +78,11 @@ For facilitating the usage GROBID service at scale, we provide clients written i - Java GROBID client - Node.js GROBID client -All these clients will take advantage of the multi-threading for scaling large set of PDF processing. As a consequence, they will be much more efficient than the [batch command lines](https://grobid.readthedocs.io/en/latest/Grobid-batch/) (which use only one thread) and should be preferred. +A third party client for Go is available offering functionality similar to the Python client: + +- Go GROBID client + +All these clients will take advantage of the multi-threading for scaling large set of PDF processing. As a consequence, they will be much more efficient than the [batch command lines](https://grobid.readthedocs.io/en/latest/Grobid-batch/) (which use only one thread) and should be preferred. For example, we have been able to run the complete full-text processing at around 10.6 PDF per second (around 915,000 PDF per day, around 20M pages per day) with the node.js client listed above during one week on one 16 CPU machine (16 threads, 32GB RAM, no SDD, articles from mainstream publishers), see [here](https://github.com/kermitt2/grobid/issues/443#issuecomment-505208132) (11.3M PDF were processed in 6 days by 2 servers without interruption). diff --git a/doc/Deep-Learning-models.md b/doc/Deep-Learning-models.md index c3db143110..886597af46 100644 --- a/doc/Deep-Learning-models.md +++ b/doc/Deep-Learning-models.md @@ -18,7 +18,7 @@ Current neural models can be up to 50 times slower than CRF, depending on the ar ## Recommended Deep Learning models -By default, only CRF models are used by Grobid. You need to select the Deep Learning models you would like to use in the GROBID configuration yaml file (`grobid/grobid-home/config/grobid.yaml`). See [here](https://grobid.readthedocs.io/en/latest/Configuration/#configuring-the-models) for more details on how to select these models. 
The most convenient way to use the Deep Learning models is to use the full GROBID Docker image and pass a configuration file at launch of the container describing the selected models to be used instead of the default CRF ones. Note that the full GROBID Docker image is already configured to use Deep Learning models for bibliographical reference and affiliation-address parsing. +By default, only CRF models are used by Grobid. You need to select the Deep Learning models you would like to use in the GROBID configuration yaml file (`grobid/grobid-home/config/grobid.yaml`). See [here](Configuration.md#configuring-the-models) for more details on how to select these models. The most convenient way to use the Deep Learning models is to use the full GROBID Docker image and pass a configuration file at launch of the container describing the selected models to be used instead of the default CRF ones. Note that the full GROBID Docker image is already configured to use Deep Learning models for bibliographical reference and affiliation-address parsing. For current GROBID version 0.8.1, we recommend considering the usage of the following Deep Learning models: @@ -46,7 +46,7 @@ However, if you need a "local" library installation and build, prepare a lot of #### Classic python and Virtualenv -0. Install GROBID as indicated [here](https://grobid.readthedocs.io/en/latest/Install-Grobid/). +0. Install GROBID as indicated [here](Install-Grobid.md). The following was tested with Java version up to 17. @@ -130,7 +130,7 @@ INFO [2020-10-30 23:04:07,756] org.grobid.core.jni.DeLFTModel: Loading DeLFT mo INFO [2020-10-30 23:04:07,758] org.grobid.core.jni.JEPThreadPool: Creating JEP instance for thread 44 ``` -It is then possible to [benchmark end-to-end](https://grobid.readthedocs.io/en/latest/End-to-end-evaluation/) the selected Deep Learning models as any usual GROBID benchmarking exercise. In practice, the CRF models should be mixed with Deep Learning models to keep the process reasonably fast and memory-hungry. In addition, note that, currently, due to the limited amount of training data, Deep Learning models perform significantly better than CRF only for a few models (`citation`, `affiliation-address`, `reference-segmenter`). This should of course certainly change in the future! +It is then possible to [benchmark end-to-end](End-to-end-evaluation.md) the selected Deep Learning models as any usual GROBID benchmarking exercise. In practice, the CRF models should be mixed with Deep Learning models to keep the process reasonably fast and memory-hungry. In addition, note that, currently, due to the limited amount of training data, Deep Learning models perform significantly better than CRF only for a few models (`citation`, `affiliation-address`, `reference-segmenter`). This should of course certainly change in the future! #### Anaconda diff --git a/doc/End-to-end-evaluation.md b/doc/End-to-end-evaluation.md index 24284141c7..4e8505629a 100644 --- a/doc/End-to-end-evaluation.md +++ b/doc/End-to-end-evaluation.md @@ -12,7 +12,7 @@ For actual benchmarks, see the [Benchmarking page](Benchmarking.md). We describe ## Datasets -The corpus used for the end-to-end evaluation of Grobid are all available in a single place on Zenodo: https://zenodo.org/record/7708580. Some of these datasets have been further annotated to make the evaluation of certain sub-structures possible (in particular code and data availability sections & funding sections). 
+The corpora used for the end-to-end evaluation of Grobid are all available in a single place on Zenodo: [https://zenodo.org/record/7708580](https://zenodo.org/record/7708580). Some of these datasets have been further annotated to make the evaluation of certain sub-structures possible (in particular code and data availability sections & funding sections). These resources are originally published under CC-BY license. Our additional annotations are similarly under CC-BY. We thank NIH, bioRxiv, PLOS and eLife for making these resources Open Access and reusable. diff --git a/doc/Grobid-batch.md b/doc/Grobid-batch.md index d856126eab..563ec60819 100644 --- a/doc/Grobid-batch.md +++ b/doc/Grobid-batch.md @@ -1,6 +1,6 @@

GROBID batch mode

-We do **not** recommend to use the batch mode. For the best performance, benchmarking and for exploiting multithreading, we recommend to use the service mode, see [Use GROBID as a service](Grobid-service.md), and not the batch mode. Clients for GROBID services are provided in [Python](https://github.com/kermitt2/grobid-client-python), [Java](https://github.com/kermitt2/grobid-client-java) and [node.js](https://github.com/kermitt2/grobid-client-node). +We do **not** recommend to use the batch mode. For the best performance, benchmarking and for exploiting multithreading, we recommend to use the service mode, see [Use GROBID as a service](Grobid-service.md), and not the batch mode. Clients for GROBID services are provided in [Python](https://github.com/kermitt2/grobid-client-python), [Java](https://github.com/kermitt2/grobid-client-java), [node.js](https://github.com/kermitt2/grobid-client-node) and [Go](https://github.com/miku/grobidclient). Using the batch mode is only necessary to create pre-annotated training data. If you do not need good runtime and just need to casually process some inputs, the batch mode is available for convenience. diff --git a/doc/Grobid-docker.md b/doc/Grobid-docker.md index 7b447a0cec..8771974a67 100644 --- a/doc/Grobid-docker.md +++ b/doc/Grobid-docker.md @@ -57,7 +57,7 @@ Access the service: - open the browser at the address `http://localhost:8080` - the health check will be accessible at the address `http://localhost:8081` -Grobid web services are then available as described in the [service documentation](https://grobid.readthedocs.io/en/latest/Grobid-service/). +Grobid web services are then available as described in the [service documentation](Grobid-service.md). By default, this image runs Deep Learning models for: @@ -113,7 +113,7 @@ Access the service: - open the browser at the address `http://localhost:8080` - the health check will be accessible at the address `http://localhost:8081` -Grobid web services are then available as described in the [service documentation](https://grobid.readthedocs.io/en/latest/Grobid-service/). +Grobid web services are then available as described in the [service documentation](Grobid-service.md). ## Configure using the yaml config file diff --git a/doc/Grobid-service.md b/doc/Grobid-service.md index 4b3f4129cc..21ced218d9 100644 --- a/doc/Grobid-service.md +++ b/doc/Grobid-service.md @@ -59,7 +59,7 @@ If required, modify the file under `grobid/grobid-home/config/grobid.yaml` for s See the [configuration page](Configuration.md) for details on how to set the different parameters of the `grobid.yaml` configuration file. Service and logging parameters are also set in this configuration file. -If Docker is used, see [here](https://grobid.readthedocs.io/en/latest/Grobid-docker/#configure-using-the-yaml-config-file) on how to start a Grobid container with a modified configuration file. +If Docker is used, see [here](Grobid-docker.md#configure-using-the-yaml-config-file) on how to start a Grobid container with a modified configuration file. ### Model loading strategy You can choose to load all the models at the start of the service or lazily when a model is used the first time, the latter being the default. @@ -178,20 +178,20 @@ curl -v -H "Accept: application/x-bibtex" --form input=@./thefile.pdf localhost: Convert the complete input document into TEI XML format (header, body and bibliographical section). 
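For illustration, here is a minimal `curl` sketch combining a few of the parameters described in the table below (the file name is a placeholder, and the host/port depend on your deployment; `8070` is the default service port):

```bash
# Hypothetical input file; adjust host and port to your own GROBID instance
curl -v --form input=@./thefile.pdf \
     --form consolidateHeader=1 \
     --form segmentSentences=1 \
     --form includeRawCopyrights=1 \
     --form teiCoordinates=biblStruct \
     localhost:8070/api/processFulltextDocument
```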
-| method | request type | response type | parameters | requirement | description | -|--- |--- |--- |--------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed | -| | | | `consolidateHeader` | optional | `consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the citation and inject DOI only), or `3` (consolidate using only extracted DOI - if extracted). | -| | | | `consolidateCitations` | optional | `consolidateCitations` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). | -| | | | `consolidatFunders` | optional | `consolidateFunders` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the funder and inject DOI only). | -| | | | `includeRawCitations` | optional | `includeRawCitations` is a boolean value, `0` (default, do not include raw reference string in the result) or `1` (include raw reference string in the result). | -| | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). | -| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). 
| -| | | | `teiCoordinates` | optional | list of element names for which coordinates in the PDF document have to be added, see [Coordinates of structures in the original PDF](Coordinates-in-PDF.md) for more details | -| | | | `segmentSentences` | optional | Paragraphs structures in the resulting TEI will be further segmented into sentence elements | -| | | | `generateIds` | optional | if supplied as a string equal to `1`, it generates uniqe identifiers for each text component | -| | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | -| | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `-1`, end with the last page of the PDF) | +| method | request type | response type | parameters | requirement | description | +|--- |--- |--- |--------------------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| POST, PUT | `multipart/form-data` | `application/xml` | `input` | required | PDF file to be processed | +| | | | `consolidateHeader` | optional | `consolidateHeader` is a string of value `0` (no consolidation), `1` (consolidate and inject all extra metadata, default value), `2` (consolidate the citation and inject DOI only), or `3` (consolidate using only extracted DOI - if extracted). | +| | | | `consolidateCitations` | optional | `consolidateCitations` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the citation and inject DOI only). | +| | | | `consolidatFunders` | optional | `consolidateFunders` is a string of value `0` (no consolidation, default value) or `1` (consolidate and inject all extra metadata), or `2` (consolidate the funder and inject DOI only). | +| | | | `includeRawCitations` | optional | `includeRawCitations` is a boolean value, `0` (default, do not include raw reference string in the result) or `1` (include raw reference string in the result). | +| | | | `includeRawAffiliations` | optional | `includeRawAffiliations` is a boolean value, `0` (default, do not include raw affiliation string in the result) or `1` (include raw affiliation string in the result). | +| | | | `includeRawCopyrights` | optional | `includeRawCopyrights` is a boolean value, `0` (default, do not include raw copyrights/license string in the result) or `1` (include raw copyrights/license string in the result). 
| +| | | | `teiCoordinates` | optional | list of element names for which coordinates in the PDF document have to be added, see [Coordinates of structures in the original PDF](Coordinates-in-PDF.md) for more details | +| | | | `segmentSentences` | optional | Paragraphs structures in the resulting TEI will be further segmented into sentence elements | +| | | | `generateIDs` | optional | if supplied as a string equal to `1`, it generates uniqe identifiers for each text component | +| | | | `start` | optional | Start page number of the PDF to be considered, previous pages will be skipped/ignored, integer with first page starting at `1`, (default `-1`, start from the first page of the PDF) | +| | | | `end` | optional | End page number of the PDF to be considered, next pages will be skipped/ignored, integer with first page starting at `1` (default `-1`, end with the last page of the PDF) | Response status codes: diff --git a/doc/Principles.md b/doc/Principles.md index 0f42d353b6..d0626b78c3 100644 --- a/doc/Principles.md +++ b/doc/Principles.md @@ -12,7 +12,7 @@ In large scale scientific document ingestion tasks, the large majority of docume To process publisher XML, complementary to GROBID, we built [Pub2TEI](https://github.com/kermitt2/Pub2TEI), a collection of style sheets developed over 11 years able to transform a variety of publisher XML formats to the same TEI XML format as produced by GROBID. This common format, which supersedes a dozen of publisher formats and many of their flavors, can centralize further any processing across PDF and heterogeneous XML sources without information loss, and support various applications (see __Fig. 1__). Similarly, LaTeX sources (typically all available arXiv sources) can be processed with our fork of [LaTeXML](https://github.com/kermitt2/LaTeXML) to produce a TEI representation compatible with GROBID and Pub2TEI output, without information loss from LaTeXML XML. -The rest of this page gives an overview of the main GROBID design principles. Skip it if you are not interested in the technical details. Functionalities are described in the [User Manual](https://grobid.readthedocs.io/en/latest/). Recent benchmarking are available [here](https://grobid.readthedocs.io/en/latest/Benchmarking/). +The rest of this page gives an overview of the main GROBID design principles. Skip it if you are not interested in the technical details. Functionalities are described in the [User Manual](index.md). Recent benchmarking are available [here](Benchmarking.md). ## Document parsing as a cascade of sequence labeling models @@ -79,13 +79,13 @@ GROBID does not use a vast amount of training data derived from existing publish - A lower amount of training data can keep models smaller (e.g. with CRF), faster to train and thus easier for setting hyperparameters. -In practice, the size of GROBID training data is smaller than the ones of CERMINE _(Tkaczyk et al., 2015)_ by a factor 30 to 100, and smaller than ScienceParse 2 by a factor 2500 to 10000. Still GROBID provides comparable or better accuracy scores. To help to ensure high-quality training data, we develop detailed [annotation guidelines](training/General-principles/) to remove as much as possible disagreements/inconsistencies regarding the annotation decision. The training data is reviewed regularly. 
We do not use double-blind annotation with reconciliation and do not compute Inter Annotator Agreement (as we should), because the average size of the annotation team is under 2 :) +In practice, the size of GROBID training data is smaller than the ones of CERMINE _(Tkaczyk et al., 2015)_ by a factor 30 to 100, and smaller than ScienceParse 2 by a factor 2500 to 10000. Still GROBID provides comparable or better accuracy scores. To help to ensure high-quality training data, we develop detailed [annotation guidelines](training/General-principles.md) to remove as much as possible disagreements/inconsistencies regarding the annotation decision. The training data is reviewed regularly. We do not use double-blind annotation with reconciliation and do not compute Inter Annotator Agreement (as we should), because the average size of the annotation team is under 2 :) ## Evaluation As the training data is crafted for accuracy and coverage, training data is strongly biased by undersampling non-edge cases. Or to rephrase it maybe more clearly: the less "informative" training examples, which are the most common ones, are less represented in our training data. Because of this bias, our manually labeled data cannot be used for evaluation. Evaluations of GROBID models are thus done with separated and stable holdout sets from publishers, which follow more realistic distributions of document variations. -See the current evaluations with [PubMed Central holdout set](https://grobid.readthedocs.io/en/latest/Benchmarking-pmc/) (1,943 documents, 90,125 bibliographical references in 139,835 citation contexts), [bioarXiv holdout set](https://grobid.readthedocs.io/en/latest/Benchmarking-biorxiv/) (2,000 documents, 98,753 bibliographical references in 142,796 citation contexts), [eLife holdout set](https://grobid.readthedocs.io/en/latest/Benchmarking-elife/) (984 documents, 63,664 bibliographical references in 109,022 reference contexts) and [PLOS holdout set](https://grobid.readthedocs.io/en/latest/Benchmarking-plos/) (1,000 documents, 48,449 bibliographical references in 69,755 reference contexts). +See the current evaluations with [PubMed Central holdout set](Benchmarking-pmc.md) (1,943 documents, 90,125 bibliographical references in 139,835 citation contexts), [bioarXiv holdout set](Benchmarking-biorxiv.md) (2,000 documents, 98,753 bibliographical references in 142,796 citation contexts), [eLife holdout set](Benchmarking-elife.md) (984 documents, 63,664 bibliographical references in 109,022 reference contexts) and [PLOS holdout set](Benchmarking-plos.md) (1,000 documents, 48,449 bibliographical references in 69,755 reference contexts). Our evaluation approach, however, raises two main issues: diff --git a/doc/Troubleshooting.md b/doc/Troubleshooting.md index a407965196..a772a54bdd 100644 --- a/doc/Troubleshooting.md +++ b/doc/Troubleshooting.md @@ -19,7 +19,7 @@ Following are the configuration used to process with `processFulltextDocument` a - in the query, consolidateHeader can be `1` or `2` if you are using the consolidation. It significantly improves the accuracy and add useful metadata. -- ff you want to consolidate all the bibliographical references and use `consolidateCitations` as `1` or `2`, CrossRef query rate limit will avoid scaling to more than 1 document per second (and likely less in practice)... For scaling the bibliographical reference resolution, you will need to use a [local consolidation service](https://github.com/kermitt2/biblio-glutton). 
The overall capacity will depend on the biblio-glutton service then, and the number of elasticsearch nodes you can exploit. From experience, it is difficult to go beyond 300K PDF per day when using consolidation for every extracted bibliographical references with one biblio-glutton instance. +- if you want to consolidate all the bibliographical references and use `consolidateCitations` as `1` or `2`, CrossRef query rate limit will avoid scaling to more than 1 document per second (and likely less in practice)... For scaling the bibliographical reference resolution, you will need to use a [local consolidation service](https://github.com/kermitt2/biblio-glutton). The overall capacity will depend on the biblio-glutton service then, and the number of elasticsearch nodes you can exploit. From experience, it is difficult to go beyond 300K PDF per day when using consolidation for every extracted bibliographical references with one biblio-glutton instance. See [full thread](https://github.com/kermitt2/grobid/issues/443). diff --git a/doc/css/custom.css b/doc/css/custom.css new file mode 100644 index 0000000000..3e7617d434 --- /dev/null +++ b/doc/css/custom.css @@ -0,0 +1,3 @@ +.wy-table-responsive table td, .wy-table-responsive table th { + white-space: inherit; +} \ No newline at end of file diff --git a/doc/training/General-principles.md b/doc/training/General-principles.md index dd40f3e3cc..837db61c68 100644 --- a/doc/training/General-principles.md +++ b/doc/training/General-principles.md @@ -8,7 +8,7 @@ This maybe of interest if the current state of the models does not correctly rec The addition of training in Grobid is __not__ done from scratch, but from pre-annotated training data generated by the existing models in Grobid. This ensures that the syntax of the new training data will be (normally) correct and that the stream of text will be easy to align with the text extracted from the PDF. This permits also to take advantage of the existing models which will annotate correctly a certain amount of text, and to focus on the corrections, thus improving the productivity of the annotator. -For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../../Training-the-models-of-Grobid/#generation-of-training-data) and [here](../../Grobid-batch/#createtraining). +For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../Training-the-models-of-Grobid.md#generation-of-training-data) and [here](../Grobid-batch.md#createtraining). After running the batch `createTraining` on a set of PDF files using methods for creating training data, each article comes with: @@ -35,7 +35,7 @@ The exact list of generated files depends on the structures occurring in the art | `*.training.references.authors.tei.xml` | citation | for all the authors appearing in the bibliographical references of the article | -These files must be reviewed and corrected manually before being added to the training data, taking into account that exploiting any additional training data requires GROBID to re-create its models - by [retraining](../Training-the-models-of-Grobid) them. +These files must be reviewed and corrected manually before being added to the training data, taking into account that exploiting any additional training data requires GROBID to re-create its models - by [retraining](../Training-the-models-of-Grobid.md) them. 
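As a rough sketch of this workflow (the jar version, paths and the Gradle task name below are examples to be checked against the batch and training documentation):

```bash
# Generate pre-annotated training files from a directory of PDFs (paths/version are placeholders)
java -Xmx4G -jar grobid-core/build/libs/grobid-core-<version>-onejar.jar \
     -gH grobid-home -dIn /path/to/pdfs -dOut /path/to/training-out -exe createTraining

# After manual review and correction of the generated files, and after moving them
# into the corpus of the corresponding model, retrain that model, e.g. for the header:
./gradlew train_header
```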
## Correcting pre-annotated files diff --git a/doc/training/fulltext.md b/doc/training/fulltext.md index 870b1d7234..5cf8b3466d 100644 --- a/doc/training/fulltext.md +++ b/doc/training/fulltext.md @@ -67,7 +67,7 @@ Paragraphs constitute the main bulk of most typical articles or publications and

``` -> Note: The `<lb/>` (line break) elements are there because they have been recognized as such in the PDF in the text flow. However the fact that they are located within or outside a tagged paragraph or section title has no impact. Just be sure NOT to modify the order of the text flow and `<lb/>` as mentionned [here](General-principles/#correcting-pre-annotated-files). +> Note: The `<lb/>` (line break) elements are there because they have been recognized as such in the PDF in the text flow. However, the fact that they are located within or outside a tagged paragraph or section title has no impact. Just be sure NOT to modify the order of the text flow and `<lb/>` as mentioned [here](General-principles.md#correcting-pre-annotated-files). Following the TEI, formulas should be on the same hierarchical level as paragraphs, and not be contained inside paragraphs: diff --git a/doc/training/header.md b/doc/training/header.md index f87e5d6c6e..6754d7856a 100644 --- a/doc/training/header.md +++ b/doc/training/header.md @@ -2,7 +2,7 @@ ## Introduction -For the following guidelines, it is expected that training data has been generated as explained [here](../Training-the-models-of-Grobid/#generation-of-training-data). +For the following guidelines, it is expected that training data has been generated as explained [here](../Training-the-models-of-Grobid.md#generation-of-training-data). In Grobid, the document "header" corresponds to the bibliographical/metadata information sections about the document. This is typically all the information at the beginning of the article (often called the "front", title, authors, publication information, affiliations, abstract, keywords, correspondence information, submission information, etc.), before the start of the document body (e.g. typically before the introduction section), but not only. Some of these elements can be located in the footnotes of the first page (e.g. affiliation of the authors), or at the end of the article (full list of authors, detailed affiliation and contact, how to cite, copyrights/licence and Open Access information). diff --git a/doc/training/segmentation.md b/doc/training/segmentation.md index 55053c81a7..6c174b9ff2 100644 --- a/doc/training/segmentation.md +++ b/doc/training/segmentation.md @@ -2,7 +2,7 @@ ## Introduction -For the following guidelines, it is expected that training data has been generated as explained [here](../Training-the-models-of-Grobid/#generation-of-training-data). +For the following guidelines, it is expected that training data has been generated as explained [here](../Training-the-models-of-Grobid.md#generation-of-training-data). The following TEI elements are used by the segmentation model: @@ -91,7 +91,7 @@ survival ``` -> Note: In general, whether the `<lb/>` (line break) element is inside or outside the `` or other elements is of no importance. However as indicated [here](General-principles/#correcting-pre-annotated-files), the element should not be removed and should follow the stream of text. +> Note: In general, whether the `<lb/>` (line break) element is inside or outside the `` or other elements is of no importance. However as indicated [here](General-principles.md#correcting-pre-annotated-files), the element should not be removed and should follow the stream of text. The following screenshot shows an example where an article starts mid-page, the end of the preceding one occupying the upper first third of the page. 
As this content does not belong to the article in question, don't add any elements and remove any `` or `` elements that could appear in the preceding article. diff --git a/grobid-core/src/main/java/org/grobid/core/document/OPSService.java b/grobid-core/src/main/java/org/grobid/core/document/OPSService.java index 1cd78d164d..7b2cf22763 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/OPSService.java +++ b/grobid-core/src/main/java/org/grobid/core/document/OPSService.java @@ -116,8 +116,10 @@ public String descriptionRetrieval(String patentNumber) throws IOException, spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); + spf.setFeature("http://xml.org/sax/features/external-general-entities", false); + spf.setFeature("http://xml.org/sax/features/external-parameter-entities", false); //get a new instance of parser - XMLReader reader = XMLReaderFactory.createXMLReader(); + XMLReader reader = spf.newSAXParser().getXMLReader(); reader.setEntityResolver(new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) { return new InputSource( diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 3545acb61f..d9f2c46006 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -297,7 +297,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio, if (config.getIncludeRawCopyrights() && biblio.getCopyright() != null && biblio.getCopyright().length()>0) { tei.append("\t\t\t\t\t

"); tei.append(TextUtilities.HTMLEncode(biblio.getCopyright())); - tei.append("\n"); + tei.append("

\n"); } tei.append("\t\t\t\t\n"); @@ -315,7 +315,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio, if (config.getIncludeRawCopyrights() && biblio.getCopyright() != null && biblio.getCopyright().length()>0) { tei.append("\t\t\t\t\t

"); tei.append(TextUtilities.HTMLEncode(biblio.getCopyright())); - tei.append("\n"); + tei.append("

\n"); } tei.append("\t\t\t\t\n"); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java index 95798c36c9..1e97e411a1 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java @@ -1,21 +1,22 @@ package org.grobid.core.engines; -import org.chasen.crfpp.Tagger; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.GrobidModel; import org.grobid.core.GrobidModels; import org.grobid.core.data.Affiliation; +import org.grobid.core.engines.label.TaggingLabel; +import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.exceptions.GrobidException; import org.grobid.core.features.FeaturesVectorAffiliationAddress; import org.grobid.core.layout.LayoutToken; import org.grobid.core.lexicon.Lexicon; +import org.grobid.core.tokenization.TaggingTokenCluster; +import org.grobid.core.tokenization.TaggingTokenClusteror; +import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.UnicodeUtil; -import org.grobid.core.utilities.LayoutTokensUtil; -import org.grobid.core.engines.tagging.GenericTaggerUtils; -import org.grobid.core.tokenization.TaggingTokenCluster; -import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; import java.util.ArrayList; import java.util.List; @@ -24,8 +25,12 @@ public class AffiliationAddressParser extends AbstractParser { public Lexicon lexicon = Lexicon.getInstance(); + protected AffiliationAddressParser(GrobidModel model) { + super(model); + } + public AffiliationAddressParser() { - super(GrobidModels.AFFILIATION_ADDRESS); + this(GrobidModels.AFFILIATION_ADDRESS); } public List processing(String input) { @@ -78,22 +83,26 @@ protected static List getAffiliationBlocks(List tokenizatio return affiliationBlocks; } + /** + * Separate affiliation blocks, when they appears to be in separate set of offsets. 
+ */ protected static List getAffiliationBlocksFromSegments(List> tokenizations) { - ArrayList affiliationBlocks = new ArrayList(); + ArrayList affiliationBlocks = new ArrayList<>(); int end = 0; for(List tokenizationSegment : tokenizations) { - if (tokenizationSegment == null || tokenizationSegment.size() == 0) + if (CollectionUtils.isEmpty(tokenizationSegment)) continue; // if we have an offset shit, we introduce a segmentation of the affiliation block LayoutToken startToken = tokenizationSegment.get(0); int start = startToken.getOffset(); - if (start-end > 2) + if (start-end > 2 && end > 0) affiliationBlocks.add("\n"); for(LayoutToken tok : tokenizationSegment) { - if (tok.getText().length() == 0) + if (StringUtils.isEmpty(tok.getText())) { continue; + } if (!tok.getText().equals(" ")) { if (tok.getText().equals("\n")) { @@ -123,11 +132,11 @@ public List processingLayoutTokens(List> tokeniza //System.out.println(affiliationBlocks.toString()); - List> placesPositions = new ArrayList>(); - List> countriesPositions = new ArrayList>(); + List> placesPositions = new ArrayList<>(); + List> countriesPositions = new ArrayList<>(); placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation)); countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation)); - List> allTokens = new ArrayList>(); + List> allTokens = new ArrayList<>(); allTokens.add(tokenizationsAffiliation); String affiliationSequenceWithFeatures = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index d2b1bf8661..dfc623a7c2 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -3023,7 +3023,7 @@ private StringBuilder getSectionAsTEI(String xmlType, StringBuilder output = new StringBuilder(); SortedSet sectionPart = doc.getDocumentPart(taggingLabel); - if (sectionPart != null && sectionPart.size() > 0) { + if (CollectionUtils.isNotEmpty(sectionPart)) { Pair sectionTokenisation = getBodyTextFeatured(doc, sectionPart); if (sectionTokenisation != null) { // if featSeg is null, it usually means that no body segment is found in the diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index 0bd8af981e..1b017c8a1f 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -16,6 +16,7 @@ import java.util.Set; import java.util.StringTokenizer; import java.util.regex.*; +import java.util.stream.Collectors; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; @@ -36,6 +37,7 @@ import org.grobid.core.utilities.Utilities; import org.grobid.core.utilities.TextUtilities; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,10 +45,9 @@ /** * Class for managing all the lexical resources. 
- * */ public class Lexicon { - private static final Logger LOGGER = LoggerFactory.getLogger(Lexicon.class); + private static final Logger LOGGER = LoggerFactory.getLogger(Lexicon.class); // private static volatile Boolean instanceController = false; private static volatile Lexicon instance; @@ -59,7 +60,7 @@ public class Lexicon { private Set countries = null; // retrieve basic naming information about a research infrastructure (key must be lower case!) - private Map > researchOrganizations = null; + private Map> researchOrganizations = null; // fast matchers for efficient and flexible pattern matching in layout token sequence or strings private FastMatcher abbrevJournalPattern = null; @@ -67,21 +68,21 @@ public class Lexicon { private FastMatcher publisherPattern = null; private FastMatcher journalPattern = null; private FastMatcher cityPattern = null; - private FastMatcher organisationPattern = null; + private FastMatcher organisationPattern = null; private FastMatcher researchInfrastructurePattern = null; - private FastMatcher locationPattern = null; + private FastMatcher locationPattern = null; private FastMatcher countryPattern = null; - private FastMatcher orgFormPattern = null; + private FastMatcher orgFormPattern = null; private FastMatcher collaborationPattern = null; private FastMatcher funderPattern = null; private FastMatcher personTitlePattern = null; - private FastMatcher personSuffixPattern = null; + private FastMatcher personSuffixPattern = null; public static Lexicon getInstance() { if (instance == null) { synchronized (Lexicon.class) { if (instance == null) { - getNewInstance(); + getNewInstance(); } } } @@ -91,11 +92,11 @@ public static Lexicon getInstance() { /** * Creates a new instance. */ - private static synchronized void getNewInstance() { - LOGGER.debug("Get new instance of Lexicon"); - GrobidProperties.getInstance(); - instance = new Lexicon(); - } + private static synchronized void getNewInstance() { + LOGGER.debug("Get new instance of Lexicon"); + GrobidProperties.getInstance(); + instance = new Lexicon(); + } /** * Hidden constructor @@ -103,24 +104,24 @@ private static synchronized void getNewInstance() { private Lexicon() { initDictionary(); initNames(); - // the loading of the journal and conference names is lazy + // the loading of the journal and conference names is lazy addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"wordforms"+File.separator+"english.wf", Language.EN); + "lexicon" + File.separator + "wordforms" + File.separator + "english.wf", Language.EN); addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"wordforms"+File.separator+"german.wf", Language.EN); + "lexicon" + File.separator + "wordforms" + File.separator + "german.wf", Language.EN); + addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + + "lexicon" + File.separator + "names" + File.separator + "names.family"); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"names.family"); - addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"lastname.5k"); + "lexicon" + File.separator + "names" + File.separator + "lastname.5k"); + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + "lexicon" + File.separator + "names" + File.separator + "names.female"); addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - 
"lexicon"+File.separator+"names"+File.separator+"names.female"); + "lexicon" + File.separator + "names" + File.separator + "names.male"); addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"names.male"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"firstname.5k"); + "lexicon" + File.separator + "names" + File.separator + "firstname.5k"); initCountryCodes(); addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"countries"+File.separator+"CountryCodes.xml"); + "lexicon" + File.separator + "countries" + File.separator + "CountryCodes.xml"); } /** @@ -139,7 +140,7 @@ public OrganizationRecord(String name, String fullName, String lang) { } private void initDictionary() { - LOGGER.info("Initiating dictionary"); + LOGGER.info("Initiating dictionary"); dictionary_en = new HashSet<>(); dictionary_de = new HashSet<>(); LOGGER.info("End of Initialization of dictionary"); @@ -149,11 +150,11 @@ public final void addDictionary(String path, String lang) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add entries to dictionary (language '" + lang + - "'), because file '" + file.getAbsolutePath() + "' does not exists."); + "'), because file '" + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add entries to dictionary (language '" + lang + - "'), because cannot read file '" + file.getAbsolutePath() + "'."); + "'), because cannot read file '" + file.getAbsolutePath() + "'."); } InputStream ist = null; InputStreamReader isr = null; @@ -202,14 +203,14 @@ public boolean isCountry(String tok) { } private void initNames() { - LOGGER.info("Initiating names"); + LOGGER.info("Initiating names"); firstNames = new HashSet(); lastNames = new HashSet(); LOGGER.info("End of initialization of names"); } private void initCountryCodes() { - LOGGER.info("Initiating country codes"); + LOGGER.info("Initiating country codes"); countryCodes = new HashMap(); countries = new HashSet(); countryPattern = new FastMatcher(); @@ -220,11 +221,11 @@ private void addCountryCodes(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add country codes to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add country codes to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; //InputStreamReader isr = null; @@ -262,7 +263,7 @@ public void initCountryPatterns() { if (countries == null || countries.size() == 0) { // it should never be the case addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"countries"+File.separator+"CountryCodes.xml"); + "lexicon" + File.separator + "countries" + File.separator + "CountryCodes.xml"); } for (String country : countries) { @@ -274,11 +275,11 @@ public final void addFirstNames(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add first names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new 
GrobidResourceException("Cannot add first names to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; BufferedReader dis = null; @@ -318,11 +319,11 @@ public final void addLastNames(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add last names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add last names to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; BufferedReader dis = null; @@ -360,6 +361,7 @@ public final void addLastNames(String path) { /** * Lexical look-up, default is English + * * @param s a string to test * @return true if in the dictionary */ @@ -415,13 +417,13 @@ public boolean inDictionary(String s, String lang) { public void initJournals() { try { abbrevJournalPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/abbrev_journals.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/abbrev_journals.txt")); journalPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/journals.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/journals.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException( - "Error when compiling lexicon matcher for abbreviated journal names.", e); + "Error when compiling lexicon matcher for abbreviated journal names.", e); } } @@ -429,7 +431,7 @@ public void initConferences() { // ArrayList conferences = new ArrayList(); try { conferencePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/proceedings.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/proceedings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for conference names.", e); } @@ -438,7 +440,7 @@ public void initConferences() { public void initPublishers() { try { publisherPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/publishers/publishers.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/publishers/publishers.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for conference names.", e); } @@ -447,7 +449,7 @@ public void initPublishers() { public void initCities() { try { cityPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/cities15000.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/cities15000.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for cities.", e); } @@ -458,56 +460,56 @@ public void initCollaborations() { //collaborationPattern = new FastMatcher(new // File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/collaborations.txt")); collaborationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/inspire_collaborations.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/inspire_collaborations.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling 
lexicon matcher for collaborations.", e); } } - public void initOrganisations() { + public void initOrganisations() { try { organisationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/government.government_agency")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/known_corporations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/venture_capital.venture_funded_company")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/government.government_agency")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/known_corporations.lst")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/venture_capital.venture_funded_company")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (IOException e) { throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + - "does not exist or cannot be read.", e); + "does not exist or cannot be read.", e); } catch (Exception e) { - throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); + } } - public void initOrgForms() { + public void initOrgForms() { try { - orgFormPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); + orgFormPattern = new FastMatcher(new + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (Exception e) { - throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); + } } - public void initLocations() { + public void initLocations() { try { locationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/location.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/location.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for locations.", e); } } - public void initPersonTitles() { + public void initPersonTitles() { try { personTitlePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/VincentNgPeopleTitles.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/VincentNgPeopleTitles.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for person titles.", e); } @@ -516,7 +518,7 @@ public void initPersonTitles() { public void initPersonSuffix() { try { personSuffixPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/suffix.txt")); + File(GrobidProperties.getGrobidHomePath() + 
"/lexicon/names/suffix.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for person name suffix.", e); } @@ -525,8 +527,8 @@ public void initPersonSuffix() { public void initFunders() { try { funderPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), + GrobidAnalyzer.getInstance(), true); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for funders.", e); } catch (Exception e) { @@ -537,19 +539,19 @@ public void initFunders() { public void initResearchInfrastructures() { try { researchInfrastructurePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), + GrobidAnalyzer.getInstance(), true); // store some name mapping researchOrganizations = new TreeMap<>(); File file = new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures_map.txt"); if (!file.exists()) { throw new GrobidResourceException("Cannot add research infrastructure names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add research infrastructure to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; BufferedReader dis = null; @@ -651,7 +653,7 @@ public List getOrganizationNamingInfo(String name) { /** * Map the language codes used by the language identifier component to the normal * language name. - * + *

* Note: due to an older bug, kr is currently map to Korean too - this should * disappear at some point in the future after retraining of models * @@ -847,7 +849,7 @@ public List tokenPositionsCityNames(List s) { /** Organisation names **/ - /** + /** * Soft look-up in organisation name gazetteer for a given string with token positions */ public List tokenPositionsOrganisationNames(String s) { @@ -913,7 +915,7 @@ public List charPositionsOrganisationNames(List s) return results; } - /** + /** * Soft look-up in organisation form name gazetteer for a given string with token positions */ public List tokenPositionsOrgForm(String s) { @@ -992,7 +994,7 @@ public List tokenPositionsLocationNames(List s) { /** * Soft look-up in location name gazetteer for a string, return a list of positions referring * to the character positions within the string. - * + *

* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) * * @param s the input string @@ -1009,7 +1011,7 @@ public List charPositionsLocationNames(String s) { /** * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of * positions referring to the character positions in the input sequence. - * + *

* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) * * @param s the input list of LayoutToken @@ -1023,7 +1025,7 @@ public List charPositionsLocationNames(List s) { return results; } - /** + /** * Soft look-up in person title gazetteer for a given string with token positions */ public List tokenPositionsPersonTitle(String s) { @@ -1174,8 +1176,8 @@ public static List tokenPositionsUrlPattern(List to */ public static List characterPositionsUrlPattern(List tokens) { String text = LayoutTokensUtil.toText(tokens); - List textResult = new ArrayList(); - Matcher urlMatcher = TextUtilities.urlPattern.matcher(text); + List textResult = new ArrayList<>(); + Matcher urlMatcher = TextUtilities.urlPattern1.matcher(text); while (urlMatcher.find()) { textResult.add(new OffsetPosition(urlMatcher.start(), urlMatcher.end())); } @@ -1185,7 +1187,7 @@ public static List characterPositionsUrlPattern(List * This will produce better quality recognized URL, avoiding missing suffixes and problems * with break lines and spaces. **/ @@ -1226,8 +1228,8 @@ public static OffsetPosition getTokenPositions(int startPos, int endPos, List urlTokens = new ArrayList<>(); int tokenPos = 0; int tokenIndex = 0; - for(LayoutToken localToken : layoutTokens) { - if (startPos <= tokenPos && (tokenPos+localToken.getText().length() <= endPos) ) { + for (LayoutToken localToken : layoutTokens) { + if (startPos <= tokenPos && (tokenPos + localToken.getText().length() <= endPos)) { urlTokens.add(localToken); if (startTokenIndex == -1) startTokenIndex = tokenIndex; @@ -1249,14 +1251,14 @@ public static OffsetPosition getTokenPositions(int startPos, int endPos, List characterPositionsUrlPatternWithPdfAnnotations( - List layoutTokens, - List pdfAnnotations) { + List layoutTokens, + List pdfAnnotations) { List urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens); List resultPositions = new ArrayList<>(); // Do we need to extend the url position based on additional position of the corresponding // PDF annotation? - for(OffsetPosition urlPosition : urlPositions) { + for (OffsetPosition urlPosition : urlPositions) { int startPos = urlPosition.start; int endPos = urlPosition.end; @@ -1272,7 +1274,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation continue; } - List urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex+1)); + List urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex + 1)); String urlString = LayoutTokensUtil.toText(urlTokens); @@ -1282,11 +1284,8 @@ public static List characterPositionsUrlPatternWithPdfAnnotation if (CollectionUtils.isNotEmpty(urlTokens)) { LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); if (pdfAnnotations != null) { - targetAnnotation = pdfAnnotations.stream() - .filter(pdfAnnotation -> - pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI && pdfAnnotation.cover(lastToken)) - .findFirst() - .orElse(null); + targetAnnotation = matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(pdfAnnotations, urlTokens); + correctedLastTokenIndex = urlTokens.size() - 1; // If we cannot match, maybe the regex got some characters too much, e.g. dots, parenthesis,etc.. 
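The two hunks above switch `characterPositionsUrlPattern` to the broader `TextUtilities.urlPattern1` (defined in the `TextUtilities.java` hunk further below), which also accepts `www.`-prefixed links and tolerates up to two stray spaces around the separators, and they delegate the PDF-annotation lookup to the new `matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens` helper. A rough standalone sketch of the regex side of the change follows; the pattern string is copied from the diff, while the class name and `main` wrapper are illustrative only and not GROBID code.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone sketch: the www-aware URL pattern introduced as TextUtilities.urlPattern1.
public class UrlPatternSketch {
    static final Pattern URL_PATTERN_1 = Pattern.compile(
        "(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]"
            + "|www\\s{0,2}\\.\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]");

    public static void main(String[] args) {
        // Mirrors the new LexiconTest case for a scheme-less, space-broken link.
        String text = "This work was distributed on www. github.com/myUsername/MyProject";
        Matcher matcher = URL_PATTERN_1.matcher(text);
        while (matcher.find()) {
            // In Lexicon.characterPositionsUrlPattern these offsets become OffsetPosition objects.
            System.out.println(matcher.start() + "-" + matcher.end() + ": "
                + text.substring(matcher.start(), matcher.end()));
        }
    }
}
```

On this input the single match covers `www. github.com/myUsername/MyProject`, which is exactly what the new `testCharacterPositionsUrlPattern_URLStartingWithWWW_shouldReturnCorrectInterval` case in `LexiconTest` asserts.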
@@ -1296,14 +1295,10 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String lastTokenText = lastToken.getText(); int index = urlTokens.size() - 1; // The error should be within a few characters, so we stop if the token length is greater than 1 - while(index > 0 && lastTokenText.length() == 1 && !Character.isLetterOrDigit(lastTokenText.charAt(0)) && targetAnnotation==null) { + while (index > 0 && lastTokenText.length() == 1 && !Character.isLetterOrDigit(lastTokenText.charAt(0)) && targetAnnotation == null) { index -= 1; LayoutToken finalLastToken1 = urlTokens.get(index); - targetAnnotation = pdfAnnotations.stream() - .filter(pdfAnnotation -> - pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI && pdfAnnotation.cover(finalLastToken1)) - .findFirst() - .orElse(null); + targetAnnotation = matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(pdfAnnotations, urlTokens); correctedLastTokenIndex = index; } @@ -1315,7 +1310,13 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String destination = targetAnnotation.getDestination(); int destinationPos = 0; - if (destination.contains(urlString)) { + if (urlString.replaceAll("\\s", "").equals(destination)) { + // Nothing to do here, we ignore the correctedLastTokenIndex because the regex got everything we need + } else if ( + destination.contains(urlString) + || destination.contains(urlString.replaceAll("\\s", "")) + || destination.contains(StringUtils.stripEnd(urlString, "-")) + ) { //In this case the regex did not catch all the URL, so we need to extend it using the // destination URL from the annotation destinationPos = destination.indexOf(urlString) + urlString.length(); @@ -1327,7 +1328,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation if ("\n".equals(nextToken.getText()) || " ".equals(nextToken.getText()) || - nextToken.getText().length() == 0) { + nextToken.getText().isEmpty()) { endPos += nextToken.getText().length(); additionalSpaces += nextToken.getText().length(); additionalTokens += 1; @@ -1355,8 +1356,6 @@ public static List characterPositionsUrlPatternWithPdfAnnotation endPos -= additionalSpaces; } } - } else if (urlString.replaceAll("\\s", "").equals(destination)) { - // Nothing to do here, we ignore the correctedLastTokenIndex because the regex got everything we need } else if (urlString.contains(destination) || urlString.replaceAll("\\s", "").contains(destination)) { // In this case the regex has catches too much, usually this should be limited to a few characters, // but we cannot know it for sure. 
Here we first find the difference between the destination and the @@ -1406,6 +1405,63 @@ public static List characterPositionsUrlPatternWithPdfAnnotation return resultPositions; } + @Nullable + private static PDFAnnotation matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(List pdfAnnotations, List urlTokens) { + LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); + String urlString = LayoutTokensUtil.toText(urlTokens); + + List possibleTargetAnnotations = pdfAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getType() != null + && pdfAnnotation.getType() == PDFAnnotation.Type.URI + && pdfAnnotation.cover(lastToken) + ).collect(Collectors.toList()); + + PDFAnnotation targetAnnotation; + if (possibleTargetAnnotations.size() > 1) { + possibleTargetAnnotations = possibleTargetAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getDestination().contains(urlString) + ) + .collect(Collectors.toList()); + + if (possibleTargetAnnotations.size() > 1) { + // If the lastToken is any of ./:_ we should add the token before + int index = urlTokens.size() - 1; + if (urlTokens.size() > 1 && lastToken.getText().matches("[.:_\\-/]")) { + index -= 1; + } + + while (index > 0 && possibleTargetAnnotations.size() > 1) { + final String lastTokenText2 = LayoutTokensUtil.toText(urlTokens.subList(index - 1, urlTokens.size())); + + possibleTargetAnnotations = possibleTargetAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getDestination().contains(lastTokenText2) + ) + .collect(Collectors.toList()); + index--; + } + + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + + } else { + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + } + + } else { + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + } + + return targetAnnotation; + } + /** * Identify in tokenized input the positions of an email address pattern with token positions diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java index f0e6cf03af..af45ad6aff 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/TextUtilities.java @@ -76,7 +76,9 @@ public class TextUtilities { static public final Pattern urlPattern0 = Pattern .compile("(?i)(https?|ftp)\\s?:\\s?//\\s?[-A-Z0-9+&@#/%?=~_()|!:,.;]*[-A-Z0-9+&@#/%=~_()|]"); static public final Pattern urlPattern = Pattern - .compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}\\/\\/\\s{0,2}[-A-Z0-9+&@#\\/%?=~_()|!:.;]*[-A-Z0-9+&@#\\/%=~_()]"); + .compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]"); + static public final Pattern urlPattern1 = Pattern + .compile("(?i)(https?|ftp)\\s{0,2}:\\s{0,2}//\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]|www\\s{0,2}\\.\\s{0,2}[-A-Z0-9+&@#/%?=~_()|!:.;]*[-A-Z0-9+&@#/%=~_()]"); // a regular expression for identifying email pattern in text // TODO: maybe find a better regex (better == more robust, not more "standard") diff --git a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java index 3f4b2d657d..c686a9cb97 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java +++ 
b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java @@ -1,33 +1,28 @@ package org.grobid.core.engines; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.AfterClass; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import static org.hamcrest.Matchers.hasSize; -import static org.hamcrest.Matchers.nullValue; -import static org.junit.Assert.assertThat; -import static org.hamcrest.CoreMatchers.is; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - import com.google.common.base.Joiner; - +import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.Affiliation; import org.grobid.core.factory.GrobidFactory; import org.grobid.core.features.FeaturesVectorAffiliationAddress; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.main.LibraryLoader; import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; +import org.junit.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.*; +import static org.junit.Assert.assertThat; public class AffiliationAddressParserTest { @@ -43,13 +38,13 @@ public class AffiliationAddressParserTest { @Before public void setUp() throws Exception { - this.target = new AffiliationAddressParser(); + this.target = new AffiliationAddressParser(GrobidModels.DUMMY); this.analyzer = GrobidAnalyzer.getInstance(); } @BeforeClass public static void init() { - LibraryLoader.load(); +// LibraryLoader.load(); GrobidProperties.getInstance(); } @@ -257,4 +252,109 @@ public void shouldExtractMultipleAffiliations() throws Exception { is("University of Madness") ); } + + @Test + @Ignore("This test is used to show the failing input data") + public void testResultExtractionLayoutTokensFromDLOutput() throws Exception { + String result = "\n" + + "\n" + + "Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t\t\n" + + "Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" + + "San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t\tI-\n" + + "Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + 
",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" + + "Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" + + "87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t\tI-\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\t\n" + + "Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t\tI-\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t\t\n" + + "Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t\tI-\n" + + ";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t\t\n"; + + List tokenizations = Arrays.stream(result.split("\n")) + .map(row -> new LayoutToken(row.split("\t")[0])) + .collect(Collectors.toList()); + + assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0))); + } + + + @Test + public void testResultExtractionLayoutTokensFromCRFOutput() throws Exception { + String result = "MD\tmd\tM\tMD\tMD\tMD\tD\tMD\tMD\tMD\tLINESTART\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXX\t\tI-\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Department\tdepartment\tD\tDe\tDep\tDepa\tt\tnt\tent\tment\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "of\tof\to\tof\tof\tof\tf\tof\tof\tof\tLINEIN\tNOCAPS\tNODIGIT\t0\t0\t1\t0\t1\t0\tNOPUNCT\txx\t\t\n" + + "Radiation\tradiation\tR\tRa\tRad\tRadi\tn\ton\tion\ttion\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Oncology\toncology\tO\tOn\tOnc\tOnco\ty\tgy\togy\tlogy\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "San\tsan\tS\tSa\tSan\tSan\tn\tan\tSan\tSan\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\tNOPUNCT\tXxx\t\tI-\n" + + "Camillo\tcamillo\tC\tCa\tCam\tCami\to\tlo\tllo\tillo\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "Forlanini\tforlanini\tF\tFo\tFor\tForl\ti\tni\tini\tnini\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + "Hospital\thospital\tH\tHo\tHos\tHosp\tl\tal\ttal\tital\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t1\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Circonvallazione\tcirconvallazione\tC\tCi\tCir\tCirc\te\tne\tone\tione\tLINESTART\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\tI-\n" + + "Gianicolense\tgianicolense\tG\tGi\tGia\tGian\te\tse\tnse\tense\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tXxxx\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + 
"87\t87\t8\t87\t87\t87\t7\t87\t87\t87\tLINESTART\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdd\t\tI-\n" + + "-\t-\t-\t-\t-\t-\t-\t-\t-\t-\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tHYPHEN\t-\t\t\n" + + "00152\t00152\t0\t00\t001\t0015\t2\t52\t152\t0152\tLINEIN\tNOCAPS\tALLDIGIT\t0\t0\t0\t0\t0\t0\tNOPUNCT\tdddd\t\t\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tCOMMA\t,\t\tI-\n" + + "Rome\trome\tR\tRo\tRom\tRome\te\tme\tome\tRome\tLINEIN\tINITCAP\tNODIGIT\t0\t1\t0\t0\t1\t0\tNOPUNCT\tXxxx\t\tI-\n" + + ",\t,\t,\t,\t,\t,\t,\t,\t,\t,\tLINEIN\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t1\t0\tCOMMA\t,\t\tI-\n" + + "Italy\titaly\tI\tIt\tIta\tItal\ty\tly\taly\ttaly\tLINEIN\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t1\tNOPUNCT\tXxxx\t\tI-\n" + + ";\t;\t;\t;\t;\t;\t;\t;\t;\t;\tLINEEND\tALLCAPS\tNODIGIT\t1\t0\t0\t0\t0\t0\tPUNCT\t;\t\t"; + + List tokenizations = Arrays.stream(result.split("\n")) + .map(row -> new LayoutToken(row.split("\t")[0])) + .collect(Collectors.toList()); + + assertThat(target.resultExtractionLayoutTokens(result, tokenizations), hasSize(greaterThan(0))); + } + + @Test + public void testGetAffiliationBlocksFromSegments_1() throws Exception { + String block1 = "Department of science, University of Science, University of Madness"; + List tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1); + tokBlock1.stream().forEach(t -> t.setOffset(t.getOffset() + 100)); + + String block2 = "Department of mental health, University of happyness, Italy"; + List tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2); + tokBlock2.stream().forEach(t -> t.setOffset(t.getOffset() + 500)); + + List affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2)); + + assertThat(affiliationBlocksFromSegments, hasSize(22)); + assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n")))); + assertThat(affiliationBlocksFromSegments.get(11), is("\n")); + } + + @Test + public void testGetAffiliationBlocksFromSegments_2() throws Exception { + String block1 = "Department of science, University of Science, University of Madness"; + List tokBlock1 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block1); + tokBlock1.stream().forEach(t -> t.setOffset(t.getOffset() + 100)); + + String block2 = "Department of mental health, University of happyness, Italy"; + List tokBlock2 = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(block2); + tokBlock2.stream().forEach(t -> t.setOffset(t.getOffset() + 100 + tokBlock1.size())); + + List affiliationBlocksFromSegments = AffiliationAddressParser.getAffiliationBlocksFromSegments(Arrays.asList(tokBlock1, tokBlock2)); + + assertThat(affiliationBlocksFromSegments, hasSize(21)); + assertThat(affiliationBlocksFromSegments.get(0), is(not(startsWith("\n")))); + assertThat(affiliationBlocksFromSegments.get(11), is(not("@newline"))); + + } } diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index 11d11cfcda..8672a11801 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -44,6 +44,47 @@ public void testTokenPositionsUrlPattern_URL_shouldReturnCorrectInterval() throw assertThat(LayoutTokensUtil.toText(tokenisedInput.subList(FirstURL.start, FirstURL.end + 1)), is("http:// github.com/myUsername/MyProject")); } + @Test + public void 
testCharacterPositionsUrlPattern_URLStartingWithWWW_shouldReturnCorrectInterval() throws Exception { + final String input = "This work was distributed on www. github.com/myUsername/MyProject"; + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition FirstURL = offsetPositions.get(0); + assertThat(input.substring(FirstURL.start, FirstURL.end), is("www. github.com/myUsername/MyProject")); + } + + @Test + public void testCharacterPositionsUrlPattern_URLStartingWithHTTPS_shouldReturnCorrectInterval() throws Exception { + final String input = "This work was distributed on https:// www.github.com/myUsername/MyProject"; + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition FirstURL = offsetPositions.get(0); + assertThat(input.substring(FirstURL.start, FirstURL.end), is("https:// www.github.com/myUsername/MyProject")); + } + + /** + * This test is to confirm the limitation of this method using the regex, where we prefer failing on some cases + * rather than have a lot of false positive. This method will be anyway complemented with the annotated links in + * the PDF (if available). + */ + @Test + public void testCharacterPositionsUrlPattern_URLTruncated_shouldReturnCorrectIntervalWithmissingPartOfURL() throws Exception { + final String input = "This work was distributed on https://www. github.com/myUsername/MyProject"; + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List offsetPositions = Lexicon.characterPositionsUrlPattern(tokenisedInput); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition FirstURL = offsetPositions.get(0); + assertThat(input.substring(FirstURL.start, FirstURL.end), is("https://www")); + } + @Test @Ignore("This test will fail, it can be used to test a real case when updating the regular exception") public void testCharacterPositionsUrlPattern_URL_shouldReturnCorrectInterval_2() throws Exception { @@ -368,7 +409,7 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC PDFAnnotation annotation1 = new PDFAnnotation(); annotation1.setPageNumber(10); List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); annotation1.setBoundingBoxes(boundingBoxes); annotation1.setDestination("https://github.com/shijuanchen/shift_cult"); annotation1.setType(PDFAnnotation.Type.URI); @@ -376,7 +417,7 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC PDFAnnotation annotation2 = new PDFAnnotation(); annotation2.setPageNumber(10); List boundingBoxes2 = new ArrayList<>(); - boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26,10.60)); + boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); annotation2.setBoundingBoxes(boundingBoxes2); annotation2.setDestination("https://sites.google.com/view/shijuanchen/research/shift_cult"); annotation2.setType(PDFAnnotation.Type.URI); @@ -391,6 +432,64 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC 
assertThat(input.substring(url1.start, url1.end), is("https://sites.google. \ncom/view/shijuanchen/research/shift_cult")); } + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_DuplicatedMatchingPDFAnnotations_shouldReturnCorrectIntervalBasedOnText4() throws Exception { + final String input = "Google Earth Engine applications to visualize the \n" + + "datasets: https://github.com/shijuanchen/shift_cult \n" + + "Map products visualization: https://sites.google. \n" + + "com/view/shijuanchen/research/shift_cult \n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(28); + lastTokenOfTheURL1.setPage(10); + lastTokenOfTheURL1.setX(504.75295121951217); + lastTokenOfTheURL1.setY(626.353); + lastTokenOfTheURL1.setWidth(40.858048780487806); + lastTokenOfTheURL1.setHeight(9.3999); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(44); + lastTokenOfTheURL2.setPage(10); + lastTokenOfTheURL2.setX(526.9964666666667); + lastTokenOfTheURL2.setY(638.853); + lastTokenOfTheURL2.setWidth(22.0712); + lastTokenOfTheURL2.setHeight(9.3999); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://github.com/shijuanchen/shift_cult"); + annotation1.setType(PDFAnnotation.Type.URI); + + PDFAnnotation annotation2 = new PDFAnnotation(); + annotation2.setPageNumber(10); + List boundingBoxes2 = new ArrayList<>(); + boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); + annotation2.setBoundingBoxes(boundingBoxes2); + annotation2.setDestination("https://www.google.com"); + annotation2.setType(PDFAnnotation.Type.URI); + + PDFAnnotation annotation3 = new PDFAnnotation(); + annotation3.setPageNumber(10); + List boundingBoxes3 = new ArrayList<>(); + boundingBoxes3.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); + annotation3.setBoundingBoxes(boundingBoxes3); + annotation3.setDestination("https://sites.google.com/view/shijuanchen/research/shift_cult"); + annotation3.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1, annotation2, annotation3); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(input.substring(url0.start, url0.end), is("https://github.com/shijuanchen/shift_cult")); + OffsetPosition url1 = offsetPositions.get(1); + assertThat(input.substring(url1.start, url1.end), is("https://sites.google. 
\ncom/view/shijuanchen/research/shift_cult")); + } + + @Test public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText5() throws Exception { final String input = ", accessible through the University of Hawaii Sea Level Center with station ID of UHSLC ID 57 \n" + @@ -453,4 +552,235 @@ public void testGetTokenPosition() throws Exception { } + @Test + public void testCharacterPositionsUrlPattern_URLRegexMatchesTooLittle_shouldReturnCorrectInterval_1() throws Exception { + final String input = "We appreciate assistance from The Research Support Center, Research Center for Human Disease Modeling, \n" + + "and Kyushu University Graduate School of Medical Sciences. We thank Dr. Mitsuru Watanabe and Ms. Eriko \n" + + "Matsuo from the Department of Neurology, Kyushu University, for the technical assistance in the flow cytometric \n" + + "analysis. We thank Ms. Sachiko Koyama and Hideko Noguchi from the Department of Neuropathology, Kyushu \n" + + "University, for excellent technical assistance in the histological analysis. We thank Mr. Tetsuo Kishi from the \n" + + "Department of Medicine, Kyushu University School of Medicine for the immunohistochemical analysis. We \n" + + "thank J. Ludovic Croxford, PhD, from Edanz (https:// jp. edanz. com/ ac) for editing a draft of this manuscript."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(219); + lastTokenOfTheURL1.setPage(15); + lastTokenOfTheURL1.setX(322.49060000000003); + lastTokenOfTheURL1.setY(454.586); + lastTokenOfTheURL1.setWidth(16.338); + lastTokenOfTheURL1.setHeight(9.099); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(220); + lastTokenOfTheURL2.setPage(15); + lastTokenOfTheURL2.setX(338.8286); + lastTokenOfTheURL2.setY(454.586); + lastTokenOfTheURL2.setWidth(3.2676); + lastTokenOfTheURL2.setHeight(9.099); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(221); + lastTokenOfTheURL3.setPage(15); + lastTokenOfTheURL3.setX(342.0962); + lastTokenOfTheURL3.setY(454.586); + lastTokenOfTheURL3.setWidth(3.2676); + lastTokenOfTheURL3.setHeight(9.099); + + LayoutToken lastTokenOfTheURL4 = tokenisedInput.get(222); + lastTokenOfTheURL4.setPage(15); + lastTokenOfTheURL4.setX(345.3638); + lastTokenOfTheURL4.setY(454.586); + lastTokenOfTheURL4.setWidth(3.2676); + lastTokenOfTheURL4.setHeight(9.099); + + LayoutToken lastTokenOfTheURL5 = tokenisedInput.get(224); + lastTokenOfTheURL5.setPage(15); + lastTokenOfTheURL5.setX(348.667); + lastTokenOfTheURL5.setY(454.586); + lastTokenOfTheURL5.setWidth(5.868599999999999); + lastTokenOfTheURL5.setHeight(9.099); + + LayoutToken lastTokenOfTheURL6 = tokenisedInput.get(225); + lastTokenOfTheURL6.setPage(15); + lastTokenOfTheURL6.setX(354.5356); + lastTokenOfTheURL6.setY(454.586); + lastTokenOfTheURL6.setWidth(2.9342999999999995); + lastTokenOfTheURL6.setHeight(9.099); + + LayoutToken lastTokenOfTheURL7 = tokenisedInput.get(227); + lastTokenOfTheURL7.setPage(15); + lastTokenOfTheURL7.setX(357.514); + lastTokenOfTheURL7.setY(454.586); + lastTokenOfTheURL7.setWidth(19.5645); + lastTokenOfTheURL7.setHeight(9.099); + + LayoutToken lastTokenOfTheURL10 = tokenisedInput.get(231); + lastTokenOfTheURL10.setPage(15); + lastTokenOfTheURL10.setX(395.106375); + lastTokenOfTheURL10.setY(454.586); + lastTokenOfTheURL10.setWidth(4.690125); + lastTokenOfTheURL10.setHeight(9.099); + + 
LayoutToken lastTokenOfTheURL11 = tokenisedInput.get(233); + lastTokenOfTheURL11.setPage(15); + lastTokenOfTheURL11.setX(399.842); + lastTokenOfTheURL11.setY(454.586); + lastTokenOfTheURL11.setWidth(7.295399999999999); + lastTokenOfTheURL11.setHeight(9.099); + + LayoutToken lastTokenOfTheURL12 = tokenisedInput.get(234); + lastTokenOfTheURL12.setPage(15); + lastTokenOfTheURL12.setX(407.13739999999996); + lastTokenOfTheURL12.setY(454.586); + lastTokenOfTheURL12.setWidth(3.6476999999999995); + lastTokenOfTheURL12.setHeight(9.099); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(15); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(15, 322.37, 451.55, 85.305, 12.140999999999963)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://jp.edanz.com/ac"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(input.substring(url0.start, url0.end), is("https:// jp. edanz. com/ ac")); + } + + @Test + public void testCharacterPositionsUrlPattern_URLRegexMatchesTooLittle_shouldReturnCorrectInterval_2() throws Exception { + /* + * This test only aims for the last link + */ + final String input = ", \n" + + "based on the sorted BAM files generated by using BWA-MEM (v.0.7.17; http:// \n" + + "biobwa.sourceforge.net/) and SAMtools (v1.546; http://www.htslib.org/). MetaBAT2 \n" + + "was applied to bin the assemblies with contig depth results under the default \n" + + "parameters (minimum contig length ≥ 1500 bp). 
CheckM v.1.0.3 (https://ecogenom \n" + + "ics.github.io/CheckM/) with the lineage_wf workflow was used to estimate the complete \n" + + "ness and contamination of MAGs "; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(132); + lastTokenOfTheURL1.setPage(5); + lastTokenOfTheURL1.setX(331.7820588235294); + lastTokenOfTheURL1.setY(467.682); + lastTokenOfTheURL1.setWidth(4.307294117647059); + lastTokenOfTheURL1.setHeight(10.818); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(133); + lastTokenOfTheURL2.setPage(5); + lastTokenOfTheURL2.setX(336.08935294117646); + lastTokenOfTheURL2.setY(467.682); + lastTokenOfTheURL2.setWidth(4.307294117647059); + lastTokenOfTheURL2.setHeight(10.818); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(134); + lastTokenOfTheURL3.setPage(5); + lastTokenOfTheURL3.setX(340.39664705882353); + lastTokenOfTheURL3.setY(467.682); + lastTokenOfTheURL3.setWidth(34.45835294117647); + lastTokenOfTheURL3.setHeight(10.818); + + LayoutToken lastTokenOfTheURL5 = tokenisedInput.get(137); + lastTokenOfTheURL5.setPage(5); + lastTokenOfTheURL5.setX(41.9999); + lastTokenOfTheURL5.setY(479.682); + lastTokenOfTheURL5.setWidth(11.487272727272726); + lastTokenOfTheURL5.setHeight(10.818); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(5); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 41.00, 468.50, 335.00, 23.00)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 134.01, 454.50, 170.18, 24.00)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 123.68, 481.50, 0.00, 9.00)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://ecogenomics.github.io/CheckM/"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(3)); + OffsetPosition url2 = offsetPositions.get(2); + assertThat(input.substring(url2.start, url2.end), is("https://ecogenom \n" + + "ics.github.io/CheckM/")); + } + + @Test + public void testCharacterPositionsUrlPattern_URLContainsSpuriosBreklineHypen_shouldReturnCorrectInterval() throws Exception { + /* + * This test only aims for the last link + */ + final String input = "Details and code for using the IntOGen framework are available at \n" + + "https://intogen.readthedocs.io/en/latest/index.html. The specific \n" + + "code to perform this analysis is available in the Genomics England \n" + + "research environment (https://re-docs.genomicsengland.co.uk/ \n" + + "access/) under /re_gecip/shared_allGeCIPs/pancancer_drivers/code/. \n" + + "The link to becoming a member of the Genomics England research \n" + + "network and obtaining access can be found at https://www.genomic-\n" + + "sengland.co.uk/research/academic/join-gecip. The code to perform \n" + + "the canSAR chemogenomics analysis is available through Zenodo \n" + + "(https://doi.org/10.5281/zenodo.8329054) (ref. 
"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL0 = tokenisedInput.get(153); + lastTokenOfTheURL0.setPage(11); + lastTokenOfTheURL0.setX(523.39535); + lastTokenOfTheURL0.setY(436.559); + lastTokenOfTheURL0.setWidth(4.205850000000001); + lastTokenOfTheURL0.setHeight(8.217); + + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(154); + lastTokenOfTheURL1.setPage(11); + lastTokenOfTheURL1.setX(527.6012); + lastTokenOfTheURL1.setY(436.559); + lastTokenOfTheURL1.setWidth(29.44095); + lastTokenOfTheURL1.setHeight(8.217); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(155); + lastTokenOfTheURL2.setPage(11); + lastTokenOfTheURL2.setX(557.04215); + lastTokenOfTheURL2.setY(436.559); + lastTokenOfTheURL2.setWidth(8.217); + lastTokenOfTheURL2.setHeight(10.818); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(157); + lastTokenOfTheURL3.setPage(11); + lastTokenOfTheURL3.setX(306.141); + lastTokenOfTheURL3.setY(447.309); + lastTokenOfTheURL3.setWidth(31.902000000000005); + lastTokenOfTheURL3.setHeight(8.217); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(11); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(11,477.14,434.60,84.12,10.18)); +// boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 134.01, 454.50, 170.18, 24.00)); +// boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 123.68, 481.50, 0.00, 9.00)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://www.genomicsengland.co.uk/research/academic/join-gecip"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(4)); + OffsetPosition url2 = offsetPositions.get(2); + assertThat(input.substring(url2.start, url2.end), is("https://www.genomic-\n" + + "sengland.co.uk/research/academic/join-gecip")); + } + } diff --git a/grobid-service/src/main/resources/web/index.html b/grobid-service/src/main/resources/web/index.html index cbfe999ff6..fbee4e457a 100644 --- a/grobid-service/src/main/resources/web/index.html +++ b/grobid-service/src/main/resources/web/index.html @@ -138,6 +138,9 @@

+ diff --git a/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml b/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml index 59bdd0ce98..39251fe8a6 100644 --- a/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml +++ b/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml @@ -14,11 +14,11 @@

The thermodynamic parameters of the LaH 10 superconductor were calculated by means of Eliashberg equations on the imaginary axis 23 :

- π μ Δ = Ω − Ω − Ω + Δ Δ =− Z k T K [ ( ) ( )] , n n B m M M n m m m m m 2 2 + π μ Δ = Ω − Ω − Ω + Δ Δ =− Z k T K [ ( ) ( )] , () n n B m M M n m m m m m 2 2

and

- π = + Ω − Ω Ω + Δ . =− Z kT K Z 1 ( ) n B m M M n m m m m n m 2 2 + π = + Ω − Ω Ω + Δ . =− Z kT K Z 1 ( ) () n B m M M n m m m m n m 2 2

The symbols Δ = Δ Ω i ( ) n n and = Z Z i ( ) n n denote the order parameter and the wave function renormalization factor, respectively. The quantity Ω n represents the Matsubara frequency: π Ω = k T n ( 2 1) n B , where k B is the Boltzmann constant. The pairing kernel is defined by: λ Ω − Ω = Ω − Ω +Ω K( ) n m ( ) C n m C 2 2 2 , where λ denotes the elec-tron-phonon coupling constant. We determined the value of λ on the basis of experimental data 20,21 and the condition: Δ = = = [ ] 0 n T T 1 C . The fitting between the theory and the experimental results is presented in Fig. 1. We obtained λ a = 2.187 for p a = 150 GPa and λ b = 2.818 for p b = 190 GPa. The symbol Ω C represents the character-istic phonon frequency, its value being assumed as Ω C = 100 meV.
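The two Eliashberg equations quoted in this training excerpt come through the PDF extraction badly garbled. For readability, one plausible reconstruction, following the standard imaginary-axis form that the surrounding sentences describe, is given below; the Coulomb pseudopotential μ* and the cut-off θ-function are assumed from that convention rather than taken from the extracted text.

```latex
% hedged reconstruction of the garbled gap and renormalization equations above
\Delta_n Z_n = \pi k_B T \sum_{m=-M}^{M}
    \frac{K(\Omega_n-\Omega_m) - \mu^{*}\,\theta(\omega_c-|\Omega_m|)}
         {\sqrt{\Omega_m^{2}+\Delta_m^{2}}}\;\Delta_m ,
\qquad
Z_n = 1 + \frac{\pi k_B T}{\Omega_n} \sum_{m=-M}^{M}
    \frac{K(\Omega_n-\Omega_m)}{\sqrt{\Omega_m^{2}+\Delta_m^{2}}}\;\Omega_m Z_m ,

% with, as defined in the paragraph above:
\Omega_n = \pi k_B T\,(2n-1) , \qquad
K(\Omega_n-\Omega_m) = \frac{\lambda\,\Omega_C^{2}}{(\Omega_n-\Omega_m)^{2}+\Omega_C^{2}} .
```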

@@ -32,19 +32,19 @@
Figure 1. The dependence of the maximum value of the order parameter on the electron-phonon coupling constant. We consider two cases: = T 215 C a K (p a = 150 GPa) and = T 260 C b K (p b = 190 GPa).
- ρ π Δ = − Ω + Δ − Ω ×      Ω + Δ       = F k T Z Z (0) 2 ( ) , B n M n n n n S n N n n n 1 2 2 2 2 + ρ π Δ = − Ω + Δ − Ω ×      Ω + Δ       = F k T Z Z (0) 2 ( ) , () B n M n n n n S n N n n n 1 2 2 2 2

where ρ(0) denotes the value of electronic density of states at Fermi surface; Z n S and Z n N are the wave function normalization factors for the superconducting and the normal state, respectively. Note that ΔF is equal to zero exactly for T = T C . This fact results from the overt dependence of free energy on solutions of Eliashberg equations (Δ n and Z n ) that have been adjusted to the experimental value of critical temperature by appropriate selection of electron-phonon coupling constant (see Fig. 1). Thermodynamic critical field should be calculated from the formula:

- ρ π ρ = − Δ . H F (0) 8 [ / (0)] C + ρ π ρ = − Δ . H F (0) 8 [ / (0)] () C

The difference in the specific heat between the superconducting and the normal state (ΔC = C S − C N ) is given by:

- ρ ρ Δ = − Δ . C T k k T d F d k T ( ) (0) [ / (0)] ( ) B B B 2 2 + ρ ρ Δ = − Δ . C T k k T d F d k T ( ) (0) [ / (0)] ( ) () B B B 2 2

The most convenient way of estimation the specific heat for the normal state is using the expression:

- ρ γ = . C T k k T ( ) (0) N B B + ρ γ = . C T k k T ( ) (0) () N B B
Figure 2. The dependence of the order parameter on temperature. The insets present the influence of temperature on the value of effective electron mass to the band electron mass ratio. Blue or red disks represent numerical results. Black curves were obtained from the analytical formulae: Δ = Δ Γ T T T T ( ) ( ) 1 ( / ) C 0 and = + Γ m m Z T Z T T T ZT / [ ( ) ( )]( / ) ( ) e e C C 0 0 , where λ = + Z T ( ) 1 C , Γ a = 3.5 and Γ b = 3.4. The predictions of the BCS theory we marked with grey circles.
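The free-energy and critical-field formulas in this passage are likewise garbled by the extraction. A hedged reading, consistent with the standard Eliashberg-theory expressions the text refers to, with ΔF the free-energy difference, ρ(0) the electronic density of states at the Fermi level, and Z_n^S, Z_n^N the renormalization factors of the superconducting and normal state:

```latex
% hedged reconstruction of the garbled free-energy and critical-field formulas above
\frac{\Delta F}{\rho(0)} = -2\pi k_B T \sum_{n=1}^{M}
    \left(\sqrt{\Omega_n^{2}+\Delta_n^{2}} - |\Omega_n|\right)
    \left(Z_n^{S} - Z_n^{N}\,\frac{|\Omega_n|}{\sqrt{\Omega_n^{2}+\Delta_n^{2}}}\right) ,
\qquad
\frac{H_C}{\sqrt{\rho(0)}} = \sqrt{-8\pi\,\bigl[\Delta F/\rho(0)\bigr]} .
```

The specific-heat jump then follows from the usual thermodynamic relation ΔC = -T d²(ΔF)/dT², and the normal-state specific heat from C^N = γT with γ the Sommerfeld constant, which appears to be what the two remaining garbled formulas of this passage express.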
@@ -59,28 +59,28 @@

Nevertheless, a sensible qualitative analysis can be made with respect to the influence of the atomic mass of the X element on a value of the critical temperature (since the mass of the X element determines Ω max ). In this regard, let us refer to the theoretical results obtained within the Eliashberg formalism for H 2 S and H 3 S superconduc-tors 5,6 . They prove that contributions to the Eliashberg function (α Ω F( ) 2 ) coming from sulphur and from hydro-gen are separated due to a huge difference between atomic masses of these two elements. To be precise, the electron-phonon interaction derived from sulphur is crucial in the frequency range from 0 meV to Ω max S equal to about 70 meV, while the contribution derived from hydrogen (Ω = 220 max H meV) is significant above ~100 meV. It is noteworthy that we come upon a similar situation in the case of the LaH 10 compound 30 . Therefore the follow-ing factorization of the Eliashberg function for the LaXH compound can be assumed:

α λ θ λ θ λ θ Ω =         − Ω +          − Ω +          − Ω F( ) ( ) ( ) ( ) , - 2 L a max La 2 max La X max X 2 max X H max H 2 max H + () 2 L a max La 2 max La X max X 2 max X H max H 2 max H

where λ La , λ X , and λ H are the contributions to the electron-phonon coupling constant derived from both metals (La, X) and hydrogen, respectively. Similarly, the symbols Ω max La , Ω max X , and Ω max H represent the respective maxi-mum phonon frequencies. The value of the critical temperature can be assessed from the generalised formula of the BCS theory 7 :

- λ λ λμ = . − . + − + . k T f f 1 27 exp 1 14(1 ) (1 0 163 ) , B C 1 2 ln + λ λ λμ = . − . + − + . k T f f 1 27 exp 1 14(1 ) (1 0 163 ) , () B C 1 2 ln

while the symbols appearing in Eq. (8) are defined in Table 1.

Let us calculate explicitly the relevant quantities:

- λ λ λ λ = + + , + λ λ λ λ = + + , ()
La X H Quantity λ = α +∞ Ω Ω d 2 F 0 2 ( ) ( ) , Ω = λ α +∞ d exp l n( ) F ln 2 0 2 ( ) , α Ω = Ω Ω λ +∞ d F( ) 2 2 0 2 , = + λ Λ ( ) f 1 1 1 3 2 1 3 , = + λ λ           + Λ f 1 2 2 ln 1 2 2 2 2 , Λ 1 = 2.4 − 0.14μ ' , μ Λ = . + Ω Ω (0 1 9 )( / ) 2 2 ln . Table 1. The quantities: λ (electron-phonon coupling constant), Ω ln (logarithmic phonon frequency), Ω 2 (second moment of the normalized weight function), f 1 (strong-coupling correction function), and f 2 (shape correction function) μ.
λ λ λ λ λ λ λ λ λ λ λ λ Ω = + +   Ω    × + +   Ω    × + +   Ω    exp l n( ) 1 2 exp l n( ) 1 2 exp l n( ) 1 2 , - + () ln La La X H max La X La X H max X H La X H max H

and

- λ λ λ λ λ λ λ λ λ λ λ λ Ω = + + + + + + + + . ( ) 2 ( ) 2 ( ) 2 2 La La X H max La 2 X La X H max X 2 H La X H max H 2 + λ λ λ λ λ λ λ λ λ λ λ λ Ω = + + + + + + + + . ( ) 2 ( ) 2 ( ) 2 () 2 La La X H max La 2 X La X H max X 2 H La X H max H 2

We are going to consider the case Ω < Ω < ~40 meV 100 meV max La max X . It means that we are interested in such an X element, the contribution of which to the Eliashberg function fills the gap between contributions com-ing from lanthanum and hydrogen. It can be assumed that 0 < λ X < 1, while keeping in mind that λ La = 0.68 31 . Additionally, the previous calculations discussed in the work allow to write that λ La + λ H is equal to λ a = 2.187 for p a = 150 GPa or to λ b = 2.818 for p b = 190 GPa. The quantity  μ occurring in the Eq. (8) serves now as the fitting parameter. One should remember that the formula for the critical temperature given by the Eq. (8) was derived with the use of significant simplifying assumptions (the value of the cut-off frequency is neglected, as well as the retardation effects modeled by the Matsubara frequency). Therefore the value of the Coulomb pseudopo-tential determined from the full Eliashberg equations usually differs from the value of  μ calculated analytically. The experimental data for the LaH 10 superconductor can be reproduced using Eq. (8) and assuming that μ = . 0 170 a and μ = . 0 276 b .
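The factorised Eliashberg function and the quantities collected in Table 1 are also hard to read in the extracted form. One consistent reading is sketched below; the weighted combination rules are derived here from the assumed factorisation and may differ in detail from the paper's exact expressions.

```latex
% hedged reading of the factorised Eliashberg function and its derived quantities
\alpha^{2}F(\Omega) =
      \lambda_{\mathrm{La}} \Bigl(\tfrac{\Omega}{\Omega_{\max}^{\mathrm{La}}}\Bigr)^{2} \theta(\Omega_{\max}^{\mathrm{La}}-\Omega)
    + \lambda_{X} \Bigl(\tfrac{\Omega}{\Omega_{\max}^{X}}\Bigr)^{2} \theta(\Omega_{\max}^{X}-\Omega)
    + \lambda_{\mathrm{H}} \Bigl(\tfrac{\Omega}{\Omega_{\max}^{\mathrm{H}}}\Bigr)^{2} \theta(\Omega_{\max}^{\mathrm{H}}-\Omega) ,

\lambda = \lambda_{\mathrm{La}} + \lambda_{X} + \lambda_{\mathrm{H}} , \qquad
\ln \Omega_{\ln} = \sum_{i} \frac{\lambda_{i}}{\lambda}\Bigl(\ln \Omega_{\max}^{i} - \tfrac{1}{2}\Bigr) , \qquad
\overline{\Omega}_{2}^{\,2} = \sum_{i} \frac{\lambda_{i}}{\lambda}\,\frac{(\Omega_{\max}^{i})^{2}}{2} ,
\quad i \in \{\mathrm{La}, X, \mathrm{H}\} ,

% standard integral definitions behind Table 1:
\lambda = 2\int_{0}^{+\infty} \mathrm{d}\Omega\, \frac{\alpha^{2}F(\Omega)}{\Omega} , \quad
\Omega_{\ln} = \exp\!\Bigl[\tfrac{2}{\lambda}\int_{0}^{+\infty} \mathrm{d}\Omega\, \frac{\alpha^{2}F(\Omega)\,\ln\Omega}{\Omega}\Bigr] , \quad
\overline{\Omega}_{2} = \Bigl[\tfrac{2}{\lambda}\int_{0}^{+\infty} \mathrm{d}\Omega\, \alpha^{2}F(\Omega)\,\Omega\Bigr]^{1/2} .
```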

diff --git a/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml b/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml index 7da5707c04..deab9cc6f4 100644 --- a/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml +++ b/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml @@ -65,7 +65,7 @@ © 2019 Rake, Haeussler. This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Data Availability Statement: -
Data on clinical trials conducted in India obtained from ClinicalTrials.gov is made available in the corresponding Supporting Information file. Other authors can also access this information through ClinicalTrials.gov. We obtained publication data from the Scopus database which is a proprietary database (www.scopus.com). Researchers interested in replicating our study can access data on trial-related publications following the search procedure described in the paper. Researchers do not need special privileges to access the Scopus database, however, a subscription may be required. The authors did not have special access privileges to the data.
+ Data on clinical trials conducted in India obtained from ClinicalTrials.gov is made available in the corresponding Supporting Information file. Other authors can also access this information through ClinicalTrials.gov. We obtained publication data from the Scopus database which is a proprietary database (www.scopus.com). Researchers interested in replicating our study can access data on trial-related publications following the search procedure described in the paper. Researchers do not need special privileges to access the Scopus database, however, a subscription may be required. The authors did not have special access privileges to the data. Funding: The authors received no specific funding for this work. diff --git a/grobid-trainer/resources/dataset/header/corpus/tei/4EE4CA4527C3CBD0A238BC82EF137E0445AEE879.training.header.tei.xml b/grobid-trainer/resources/dataset/header/corpus/tei/4EE4CA4527C3CBD0A238BC82EF137E0445AEE879.training.header.tei.xml index 1f5b45d79c..ab1d6c68df 100644 --- a/grobid-trainer/resources/dataset/header/corpus/tei/4EE4CA4527C3CBD0A238BC82EF137E0445AEE879.training.header.tei.xml +++ b/grobid-trainer/resources/dataset/header/corpus/tei/4EE4CA4527C3CBD0A238BC82EF137E0445AEE879.training.header.tei.xml @@ -70,7 +70,7 @@ Received 2 July; accepted 29 October 2002 ; - doi:10.1038/nature01278. + doi:10.1038/nature01278. diff --git a/mkdocs.yml b/mkdocs.yml index d6b9b08d51..609bec6e55 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,10 +4,10 @@ repo_name: GitHub theme: readthedocs site_description: Documentation for GROBID docs_dir: doc +extra_css: + - css/custom.css plugins: - search -theme: - name: readthedocs nav: - Home: 'index.md' - About: