Skip to content

Commit

Permalink
Working on similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafael C. Carrasco committed Jul 22, 2014
1 parent dd86192 commit cf7985e
Show file tree
Hide file tree
Showing 7 changed files with 2,170 additions and 13 deletions.
2,110 changes: 2,110 additions & 0 deletions output/output.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/main/java/com/cervantesvirtual/MARCauthority/Creator.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.cervantesvirtual.dates.Period;
import com.cervantesvirtual.metadata.MARCDataField;
import com.cervantesvirtual.metadata.MARCSubfield;
import java.util.Arrays;

/**
* Stores data about a creator.
Expand All @@ -28,6 +29,7 @@ public class Creator extends MARCDataField {
private CyclicArray<String> normalizedVariants(String name) {
// String[] tokens = normal.split("\\p{Space}+");
String[] tokens = Normalizer.removeStopwords(name.trim()).split(",");
System.out.println(name+"->"+Arrays.toString(tokens));
for (int n = 0; n < tokens.length; ++n) {
tokens[n] = Normalizer.normalize(tokens[n]);
}
Expand Down Expand Up @@ -114,6 +116,7 @@ public int nameHashCode() {
/**
 * Checks whether this creator and another one may describe the same entity.
 *
 * Two creators are compatible when all of the following hold:
 * their variants are equal, their titles do not contradict each other
 * (a missing title on either side is never a contradiction), and their
 * periods are compatible according to the period's own compatible check.
 *
 * @param other the creator to compare against
 * @return true if variants, titles and periods are all compatible
 */
public boolean compatible(Creator other) {
    // Titles conflict only when both are present and differ.
    boolean titlesConflict = this.title != null && other.title != null
            && !this.title.equals(other.title);

    if (!this.variants.equals(other.variants)) {
        return false;
    }
    if (titlesConflict) {
        return false;
    }
    // Periods are consulted last, and only when everything else matched.
    return this.period.compatible(other.period);
}
Expand Down
19 changes: 10 additions & 9 deletions src/main/java/com/cervantesvirtual/MARCauthority/CreatorSet.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
package com.cervantesvirtual.MARCauthority;

import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import com.cervantesvirtual.metadata.FieldType;
import com.cervantesvirtual.io.Messages;
import com.cervantesvirtual.metadata.Collection;
import com.cervantesvirtual.metadata.Record;
import com.cervantesvirtual.metadata.Field;
import com.cervantesvirtual.metadata.FieldType;
import com.cervantesvirtual.metadata.MARCDataField;
import com.cervantesvirtual.metadata.Record;
import com.cervantesvirtual.util.MultiTreeMap;
import com.cervantesvirtual.util.StringFinder;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* A set storing all creators and variants (extended MARCDataFields).
Expand Down Expand Up @@ -109,8 +109,9 @@ public void printSimilar(PrintWriter writer) {

for (Creator creator : this) {
String name = creator.getFullName();
//writer.println(name);
Set<String> alternatives = finder.select(name, name.length() / 4);

//Messages.info(name);
for (String alternative : alternatives) {
// System.out.println(name + " * " + alternative);
for (Creator altcreator : creators.get(alternative)) {
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/com/cervantesvirtual/metadata/Collection.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.cervantesvirtual.metadata;

import com.cervantesvirtual.io.CSVReader;
import com.cervantesvirtual.io.Messages;
import com.cervantesvirtual.xml.DocumentParser;
import java.io.BufferedReader;
import java.io.File;
Expand Down Expand Up @@ -78,11 +79,12 @@ public Collection(MetadataFormat format, File file)

this.format = format;
this.records = new ArrayList<Record>();

if (file.isDirectory()) {
for(File subfile: file.listFiles()) {
add(format, subfile);
}
Messages.info("Read all files in " + file.getName());
} else {
add(format, file);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.cervantesvirtual.metadata.marc;
package com.cervantesvirtual.MARCauthority;


import junit.framework.TestCase;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.cervantesvirtual.metadata.marc;
package com.cervantesvirtual.MARCauthority;

import com.cervantesvirtual.MARCauthority.CreatorSet;
import com.cervantesvirtual.metadata.Collection;
import com.cervantesvirtual.metadata.MetadataFormat;
import java.io.File;
Expand All @@ -20,4 +19,5 @@ public void testCreatorSet() {
System.out.println(set);
assertEquals(2, set.size());
}

}
41 changes: 41 additions & 0 deletions src/test/java/com/cervantesvirtual/MARCauthority/CreatorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

package com.cervantesvirtual.MARCauthority;

import junit.framework.TestCase;

/**
 * Unit tests for {@link Creator}: compatibility between two creators and
 * the similarity score between two creator names.
 *
 * @author rafa
 */
public class CreatorTest extends TestCase {

    /**
     * Test of compatible method, of class Creator.
     *
     * Uses JUnit's assertTrue rather than the Java assert keyword: the
     * keyword is a no-op unless the JVM runs with assertions enabled, which
     * would let this test pass without checking anything.
     */
    public void testCompatible() {
        System.out.println("compatible");
        Creator instance = new Creator("Paganini, Paganinus de,");
        Creator other = new Creator("Paganinus de Paganini");
        assertTrue("creators built from reordered name parts should be compatible",
                instance.compatible(other));
    }

    /**
     * Test of similarity method, of class Creator.
     *
     * The expected score is 1/3, compared with an absolute tolerance of 0.01.
     */
    public void testSimilarity() {
        System.out.println("similarity");

        Creator creator1 = new Creator("Paganinis, Paganinus de,");
        Creator creator2 = new Creator("Paganini, Paganinus de");
        double expResult = 1.0 / 3;
        double result = creator1.similarity(creator2);
        assertEquals("unexpected similarity score", expResult, result, 0.01);
    }

}

0 comments on commit cf7985e

Please sign in to comment.