From 0f9a1dc54f6b0d660b50ad12b642a99ccc64acb3 Mon Sep 17 00:00:00 2001 From: lopez Date: Thu, 1 Feb 2024 13:10:13 +0100 Subject: [PATCH] fix copyright class naming --- Readme.md | 7 ++++--- .../main/java/org/grobid/core/data/CopyrightsLicense.java | 4 ++-- .../java/org/grobid/core/engines/LicenseClassifier.java | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Readme.md b/Readme.md index b6b74e46cf..0547371813 100644 --- a/Readme.md +++ b/Readme.md @@ -33,8 +33,9 @@ The following functionalities are available: - __Consolidation/resolution of the extracted bibliographical references__ using the [biblio-glutton](https://github.com/kermitt2/biblio-glutton) service or the [CrossRef REST API](https://github.com/CrossRef/rest-api-doc). In both cases, DOI/PMID resolution performance is higher than 0.95 F1-score from PDF extraction. - __Extraction and parsing of patent and non-patent references in patent__ publications. - __Extraction of Funders and funding information__ with optional matching of extracted funders with the CrossRef Funder Registry. +- __Identification of copyrights' owner and license associated to the document__, e.g. publisher or authors copyrights, CC-BY/CC-BY-NC/etc. license. -In a complete PDF processing, GROBID manages 55 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). +In a complete PDF processing, GROBID manages 68 final labels used to build relatively fine-grained structures, from traditional publication metadata (title, author first/last/middle names, affiliation types, detailed address, journal, volume, issue, pages, DOI, PMID, etc.) to full text structures (section title, paragraph, reference markers, head/foot notes, figure captions, etc.). 
GROBID includes a comprehensive [web service API](https://grobid.readthedocs.io/en/latest/Grobid-service/), [Docker images](https://grobid.readthedocs.io/en/latest/Grobid-docker/), [batch processing](https://grobid.readthedocs.io/en/latest/Grobid-batch/), a JAVA API, a generic [training and evaluation framework](https://grobid.readthedocs.io/en/latest/Training-the-models-of-Grobid/) (precision, recall, etc., n-fold cross-evaluation), systematic [end-to-end benchmarking](https://grobid.readthedocs.io/en/latest/Benchmarking/) on thousand documents and the semi-automatic generation of training data. @@ -108,7 +109,7 @@ A series of additional modules have been developed for performing __structure aw - [grobid-quantities](https://github.com/kermitt2/grobid-quantities): recognition and normalization of physical quantities/measurements - [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors): recognition of superconductor material and properties in scientific literature - [entity-fishing](https://github.com/kermitt2/entity-fishing), a tool for extracting Wikidata entities from text and document, which can also use Grobid to pre-process scientific articles in PDF, leading to more precise and relevant entity extraction and the capacity to annotate the PDF with interactive layout -- [dataseer-ml](https://github.com/dataseer/dataseer-ml): identification of sections and sentences introducing datasets in a scientific article, and classification of the type of these datasets +- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names (implicit and named datasets) and classification of the type of these datasets - [grobid-ner](https://github.com/kermitt2/grobid-ner): named entity recognition - [grobid-astro](https://github.com/kermitt2/grobid-astro): recognition of astronomical entities in scientific papers - 
[grobid-bio](https://github.com/kermitt2/grobid-bio): a toy bio-entity tagger using BioNLP/NLPBA 2004 dataset @@ -143,7 +144,7 @@ If you want to cite this work, please refer to the present GitHub project, toget title = {GROBID}, howpublished = {\url{https://github.com/kermitt2/grobid}}, publisher = {GitHub}, - year = {2008--2023}, + year = {2008--2024}, archivePrefix = {swh}, eprint = {1:dir:dab86b296e3c3216e2241968f0d63b68e8209d3c} } diff --git a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java index 56f3f36a92..9ea5f5331f 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java +++ b/grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java @@ -39,7 +39,7 @@ public enum License { CCBYSA ("CC-BY-SA"), CCBYNCSA ("CC-BY-NC-SA"), CCBYND ("CC-BY-ND"), - COPYRIGHTS ("strict-copyrights"), + COPYRIGHT ("strict-copyrights"), OTHER ("other"), UNDECIDED ("undecided"); @@ -55,7 +55,7 @@ public String getName() { }; public static List licenses = - Arrays.asList("CC-0", "CC-BY", "CC-BY-NC", "CC-BY-NC-ND", "CC-BY-SA", "CC-BY-NC-SA", "CC-BY-ND", "copyrights", "other", "undecided"); + Arrays.asList("CC-0", "CC-BY", "CC-BY-NC", "CC-BY-NC-ND", "CC-BY-SA", "CC-BY-NC-SA", "CC-BY-ND", "copyright", "other", "undecided"); private CopyrightsOwner copyrightsOwner; private double copyrightsOwnerProb; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java index 29aec37733..1035057019 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java @@ -120,7 +120,7 @@ protected static List extractResults(String copyrightOwnerAsJ double scoreUndecided = 0.0; int rank = 0; for (Double scoreField : scoreFields) { - if (scoreField>0.5 && scoreField >= bestProb) { + 
if (scoreField>0.5 && scoreField > bestProb) { owner = CopyrightsOwner.valueOf(owners.get(rank).toUpperCase()); bestProb = scoreField; } @@ -156,7 +156,7 @@ protected static List extractResults(String copyrightOwnerAsJ License license = null; rank = 0; for (Double scoreField : scoreFields) { - if (scoreField>0.5 && scoreField >= bestProb) { + if (scoreField>0.5 && scoreField > bestProb) { String valueLicense = licenses.get(rank); valueLicense = valueLicense.replace("-", ""); license = License.valueOf(valueLicense.toUpperCase());