-
Notifications
You must be signed in to change notification settings - Fork 461
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
start integrating copyright and license model and classes
- Loading branch information
Showing
8 changed files
with
400 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
96 changes: 96 additions & 0 deletions
96
grobid-core/src/main/java/org/grobid/core/data/CopyrightsLicense.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package org.grobid.core.data; | ||
|
||
import org.grobid.core.utilities.TextUtilities; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Arrays; | ||
|
||
/**
 * Class for representing information related to copyrights owner and file license.
 *
 * <p>An instance carries one predicted {@link CopyrightsOwner} and one predicted
 * {@link License}, each with an associated probability. Every enum constant has a
 * textual label (see {@code getName()}) which differs from the Java identifier of the
 * constant (e.g. {@code CCBY} has the label {@code "CC-BY"}); use {@code fromName} to map
 * a label back to its constant, since the built-in {@code valueOf} only accepts
 * identifiers.
 */
public class CopyrightsLicense {

    /** Possible copyrights owners. */
    public enum CopyrightsOwner {
        PUBLISHER("publisher"),
        AUTHORS("authors"),
        UNDECIDED("undecided");

        private final String name;

        CopyrightsOwner(String name) {
            this.name = name;
        }

        /**
         * @return the textual label of this owner (e.g. {@code "publisher"})
         */
        public String getName() {
            return name;
        }

        /**
         * Resolve an owner from its textual label.
         *
         * @param name a label as returned by {@link #getName()}
         * @return the constant whose label equals {@code name}
         * @throws IllegalArgumentException if no constant has this label
         */
        public static CopyrightsOwner fromName(String name) {
            for (CopyrightsOwner owner : values()) {
                if (owner.name.equals(name)) {
                    return owner;
                }
            }
            throw new IllegalArgumentException("Unknown copyrights owner label: " + name);
        }
    }

    /**
     * Labels of {@link CopyrightsOwner}, in the declaration order of the enum constants.
     * Intended to be read-only; keep in sync with the enum.
     */
    public static List<String> copyrightOwners = Arrays.asList("publisher", "authors", "undecided");

    /** File-level licenses. */
    public enum License {
        CC0("CC-0"),
        CCBY("CC-BY"),
        CCBYNC("CC-BY-NC"),
        CCBYNCND("CC-BY-NC-ND"),
        CCBYSA("CC-BY-SA"),
        CCBYNCSA("CC-BY-NC-SA"),
        CCBYND("CC-BY-ND"),
        COPYRIGHTS_STRICT("copyrights"),
        OTHER("other"),
        UNDECIDED("undecided");

        private final String name;

        License(String name) {
            this.name = name;
        }

        /**
         * @return the textual label of this license (e.g. {@code "CC-BY"})
         */
        public String getName() {
            return name;
        }

        /**
         * Resolve a license from its textual label.
         *
         * @param name a label as returned by {@link #getName()}
         * @return the constant whose label equals {@code name}
         * @throws IllegalArgumentException if no constant has this label
         */
        public static License fromName(String name) {
            for (License license : values()) {
                if (license.name.equals(name)) {
                    return license;
                }
            }
            throw new IllegalArgumentException("Unknown license label: " + name);
        }
    }

    /**
     * Labels of {@link License}, in the declaration order of the enum constants.
     * Intended to be read-only; keep in sync with the enum.
     */
    public static List<String> licenses =
        Arrays.asList("CC-0", "CC-BY", "CC-BY-NC", "CC-BY-NC-ND", "CC-BY-SA", "CC-BY-NC-SA", "CC-BY-ND", "copyrights", "other", "undecided");

    private CopyrightsOwner copyrightsOwner;
    private double copyrightsOwnerProb;
    private License license;
    private double licenseProb;

    /**
     * @return the copyrights owner, or {@code null} if none has been set
     */
    public CopyrightsOwner getCopyrightsOwner() {
        return this.copyrightsOwner;
    }

    public void setCopyrightsOwner(CopyrightsOwner owner) {
        this.copyrightsOwner = owner;
    }

    /**
     * @return the probability associated with the copyrights owner (0.0 if never set)
     */
    public double getCopyrightsOwnerProb() {
        return this.copyrightsOwnerProb;
    }

    public void setCopyrightsOwnerProb(double prob) {
        this.copyrightsOwnerProb = prob;
    }

    /**
     * @return the license, or {@code null} if none has been set
     */
    public License getLicense() {
        return this.license;
    }

    public void setLicense(License license) {
        this.license = license;
    }

    /**
     * @return the probability associated with the license (0.0 if never set)
     */
    public double getLicenseProb() {
        return this.licenseProb;
    }

    public void setLicenseProb(double prob) {
        this.licenseProb = prob;
    }
}
236 changes: 236 additions & 0 deletions
236
grobid-core/src/main/java/org/grobid/core/engines/LicenseClassifier.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
package org.grobid.core.engines; | ||
|
||
import java.util.*; | ||
import org.apache.commons.lang3.StringUtils; | ||
|
||
import org.grobid.core.data.CopyrightsLicense; | ||
import org.grobid.core.data.CopyrightsLicense.CopyrightsOwner; | ||
import org.grobid.core.data.CopyrightsLicense.License; | ||
import org.grobid.core.exceptions.GrobidException; | ||
import org.grobid.core.utilities.TextUtilities; | ||
import org.grobid.core.utilities.UnicodeUtil; | ||
import org.grobid.core.utilities.GrobidProperties; | ||
import org.grobid.core.GrobidModels; | ||
import org.grobid.core.factory.GrobidFactory; | ||
import org.grobid.core.jni.PythonEnvironmentConfig; | ||
import org.grobid.core.jni.DeLFTClassifierModel; | ||
import org.grobid.core.utilities.GrobidConfig.ModelParameters; | ||
|
||
import com.fasterxml.jackson.core.*; | ||
import com.fasterxml.jackson.databind.*; | ||
import com.fasterxml.jackson.databind.node.*; | ||
import com.fasterxml.jackson.annotation.*; | ||
import com.fasterxml.jackson.core.io.*; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class LicenseClassifier { | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(LicenseClassifier.class); | ||
|
||
// multi-class/multi-label classifier | ||
private DeLFTClassifierModel classifierCopyrightsOwner = null; | ||
private DeLFTClassifierModel classifierLicense = null; | ||
|
||
// binary classifiers to be added if used | ||
|
||
private Boolean useBinary = false; | ||
|
||
private JsonParser parser; | ||
|
||
private static volatile LicenseClassifier instance; | ||
|
||
public static LicenseClassifier getInstance() { | ||
if (instance == null) { | ||
getNewInstance(); | ||
} | ||
return instance; | ||
} | ||
|
||
/** | ||
* Create a new instance. | ||
*/ | ||
private static synchronized void getNewInstance() { | ||
instance = new LicenseClassifier(); | ||
} | ||
|
||
private LicenseClassifier() { | ||
//ModelParameters parameterCopyrightsOwner = GrobidProperties.getModel("copyrights-owner"); | ||
//ModelParameters parameterLicenses = GrobidProperties.getModel("licenses"); | ||
|
||
//this.useBinary = configuration.getUseBinaryContextClassifiers(); | ||
if (this.useBinary == null) | ||
this.useBinary = false; | ||
|
||
this.classifierCopyrightsOwner = new DeLFTClassifierModel("copyright", GrobidProperties.getDelftArchitecture("copyright")); | ||
this.classifierLicense = new DeLFTClassifierModel("license", GrobidProperties.getDelftArchitecture("license")); | ||
} | ||
|
||
/** | ||
* Classify a simple piece of text | ||
* @return list of predicted labels/scores pairs | ||
*/ | ||
public CopyrightsLicense classify(String text) throws Exception { | ||
if (StringUtils.isEmpty(text)) | ||
return null; | ||
List<String> texts = new ArrayList<String>(); | ||
texts.add(text); | ||
return classify(texts).get(0); | ||
} | ||
|
||
/** | ||
* Classify an array of texts | ||
* @return list of predicted labels/scores pairs for each text | ||
*/ | ||
public List<CopyrightsLicense> classify(List<String> texts) throws Exception { | ||
if (texts == null || texts.size() == 0) | ||
return null; | ||
|
||
LOGGER.info("classify: " + texts.size()); | ||
|
||
String the_json_copyrights_owner = this.classifierCopyrightsOwner.classify(texts); | ||
String the_json_licenses = this.classifierLicense.classify(texts); | ||
|
||
List<CopyrightsLicense> results = new ArrayList<>(); | ||
|
||
// set resulting context classes to entity mentions | ||
try { | ||
ObjectMapper mapper = new ObjectMapper(); | ||
JsonNode root_copyrights = mapper.readTree(the_json_copyrights_owner); | ||
JsonNode root_licenses = mapper.readTree(the_json_licenses); | ||
|
||
int entityRank =0; | ||
JsonNode classificationsNodeCopyrights = root_copyrights.findPath("classifications"); | ||
JsonNode classificationsNodeLicenses = root_licenses.findPath("classifications"); | ||
if ((classificationsNodeCopyrights != null) && (!classificationsNodeCopyrights.isMissingNode()) && | ||
(classificationsNodeLicenses != null) && (!classificationsNodeLicenses.isMissingNode())) { | ||
Iterator<JsonNode> ite1 = classificationsNodeCopyrights.elements(); | ||
Iterator<JsonNode> ite2 = classificationsNodeLicenses.elements(); | ||
while (ite1.hasNext()) { | ||
CopyrightsLicense result = new CopyrightsLicense(); | ||
JsonNode classificationsNode = ite1.next(); | ||
|
||
List<String> owners = CopyrightsLicense.copyrightOwners; | ||
List<Double> scoreFields = new ArrayList<>(); | ||
|
||
for(String fieldOwners : owners) { | ||
JsonNode fieldNode = classificationsNode.findPath(fieldOwners); | ||
double scoreField = 0.0; | ||
if ((fieldNode != null) && (!fieldNode.isMissingNode())) { | ||
scoreFields.add(fieldNode.doubleValue()); | ||
} | ||
} | ||
|
||
CopyrightsOwner owner = null; | ||
double bestProb = 0.0; | ||
double scoreUndecided = 0.0; | ||
int rank = 0; | ||
for (Double scoreField : scoreFields) { | ||
if (scoreField>0.5 && scoreField >= bestProb) { | ||
owner = CopyrightsOwner.valueOf(owners.get(rank)); | ||
bestProb = scoreField; | ||
} | ||
scoreUndecided = scoreField; | ||
rank++; | ||
} | ||
|
||
if (owner == null) { | ||
owner = CopyrightsOwner.UNDECIDED; | ||
bestProb = scoreUndecided; | ||
} | ||
|
||
/*JsonNode publisherNode = classificationsNodeCopyrights.findPath("publisher"); | ||
JsonNode authorsNode = classificationsNodeCopyrights.findPath("authors"); | ||
JsonNode undecideNode = classificationsNodeCopyrights.findPath("undecided"); | ||
JsonNode textNode = classificationsNodeCopyrights.findPath("text"); | ||
double scorePublisher = 0.0; | ||
if ((publisherNode != null) && (!publisherNode.isMissingNode())) { | ||
scorePublisher = publisherNode.doubleValue(); | ||
} | ||
double scoreAuthors = 0.0; | ||
if ((authorsNode != null) && (!authorsNode.isMissingNode())) { | ||
scoreAuthors = authorsNode.doubleValue(); | ||
} | ||
double scoreUndecided = 0.0; | ||
if ((undecideNode != null) && (!undecideNode.isMissingNode())) { | ||
scoreUndecided = undecideNode.doubleValue(); | ||
} | ||
String textValue = null; | ||
if ((textNode != null) && (!textNode.isMissingNode())) { | ||
textValue = textNode.textValue(); | ||
} | ||
CopyrightsOwner owner = null; | ||
double bestProb = 0.0; | ||
if (scorePublisher>0.5) { | ||
owner = CopyrightsOwner.PUBLISHER; | ||
bestProb = scorePublisher; | ||
} | ||
if (scoreAuthors > 0.5 && scoreAuthors >= scorePublisher) { | ||
owner = CopyrightsOwner.AUTHORS; | ||
bestProb = scoreAuthors; | ||
} | ||
if (scoreUndecided > bestProb) { | ||
owner = CopyrightsOwner.UNDECIDED; | ||
bestProb = scoreUndecided; | ||
}*/ | ||
|
||
// ser best copyright owner with prob | ||
result.setCopyrightsOwner(owner); | ||
result.setCopyrightsOwnerProb(bestProb); | ||
|
||
classificationsNode = ite2.next(); | ||
|
||
bestProb = 0.0; | ||
List<String> licenses = CopyrightsLicense.licenses; | ||
scoreFields = new ArrayList<>(); | ||
|
||
for(String fieldLicenses : licenses) { | ||
JsonNode fieldNode = classificationsNode.findPath(fieldLicenses); | ||
double scoreField = 0.0; | ||
if ((fieldNode != null) && (!fieldNode.isMissingNode())) { | ||
scoreFields.add(fieldNode.doubleValue()); | ||
} | ||
} | ||
|
||
bestProb = 0.0; | ||
scoreUndecided = 0.0; | ||
License license = null; | ||
rank = 0; | ||
for (Double scoreField : scoreFields) { | ||
if (scoreField>0.5 && scoreField >= bestProb) { | ||
license = License.valueOf(licenses.get(rank)); | ||
bestProb = scoreField; | ||
} | ||
scoreUndecided = scoreField; | ||
rank++; | ||
} | ||
|
||
if (license == null) { | ||
license = License.UNDECIDED; | ||
bestProb = scoreUndecided; | ||
} | ||
|
||
// get best license with prob | ||
result.setLicense(license); | ||
result.setLicenseProb(bestProb); | ||
|
||
results.add(result); | ||
entityRank++; | ||
} | ||
} | ||
} catch(JsonProcessingException e) { | ||
LOGGER.error("failed to parse JSON copyrights/licenses classification result", e); | ||
} | ||
|
||
return results; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{ | ||
"model_name": "copyright_gru", | ||
"architecture": "gru", | ||
"embeddings_name": "glove-840B", | ||
"char_embedding_size": 25, | ||
"word_embedding_size": 300, | ||
"dropout": 0.5, | ||
"recurrent_dropout": 0.25, | ||
"maxlen": 300, | ||
"dense_size": 32, | ||
"use_char_feature": false, | ||
"list_classes": [ | ||
"publisher", | ||
"authors", | ||
"undecided" | ||
], | ||
"fold_number": 1, | ||
"batch_size": 256, | ||
"transformer_name": null | ||
} |
Binary file not shown.
Oops, something went wrong.