diff --git a/configs/AvgTermFrequency.conf b/configs/AvgTermFrequency.conf index 6d1757f..4b339dc 100644 --- a/configs/AvgTermFrequency.conf +++ b/configs/AvgTermFrequency.conf @@ -29,6 +29,7 @@ "jsonClass" : "OneFeatureTCWeighterConfig", "feature" : { "jsonClass" : "AvgTermFrequency" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/Basic.conf b/configs/Basic.conf index d3d5ddd..8b7eea2 100644 --- a/configs/Basic.conf +++ b/configs/Basic.conf @@ -31,6 +31,7 @@ "jsonClass" : "Basic", "longerTermsCoeff" : 0.72, "minSubTermSize" : 2 - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/CValue.conf b/configs/CValue.conf index fe67dda..570e218 100644 --- a/configs/CValue.conf +++ b/configs/CValue.conf @@ -30,6 +30,7 @@ "feature" : { "jsonClass" : "CValue", "smoothing" : 0.1 - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/ComboBasic.conf b/configs/ComboBasic.conf index e0dcb68..5277b2e 100644 --- a/configs/ComboBasic.conf +++ b/configs/ComboBasic.conf @@ -31,6 +31,7 @@ "jsonClass" : "ComboBasic", "longerTermsCoeff" : 0.75, "shorterTermsCoeff" : 0.1 - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/DomainPertinence.conf b/configs/DomainPertinence.conf index 5c5ec6f..30081c7 100644 --- a/configs/DomainPertinence.conf +++ b/configs/DomainPertinence.conf @@ -35,6 +35,7 @@ "epsilon" : 1.0E-20 }, "notFoundTermSmoothing" : 0.1 - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/KeyConceptRelatedness.conf b/configs/KeyConceptRelatedness.conf index b1de362..cf3d2a8 100644 --- a/configs/KeyConceptRelatedness.conf +++ b/configs/KeyConceptRelatedness.conf @@ -65,6 +65,7 @@ "missSimValue" : 0.0 } } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/LinkProbability.conf b/configs/LinkProbability.conf index 6070b57..9914226 100644 --- a/configs/LinkProbability.conf +++ b/configs/LinkProbability.conf @@ -31,6 +31,7 @@ "jsonClass" : "LinkProbability", "threshold" : 0.018, "fileName" : "./data/info-measure.txt" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/NovelTopicModel.conf b/configs/NovelTopicModel.conf index 06c3972..b252cfb 100644 --- a/configs/NovelTopicModel.conf +++ b/configs/NovelTopicModel.conf @@ -44,6 +44,7 @@ "rareWordsThreshold" : 1, "randomSeed" : 13, "topWordsForTopic" : 200 - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/PU.conf b/configs/PU.conf index f62fdb1..9f9a3fe 100644 --- a/configs/PU.conf +++ b/configs/PU.conf @@ -136,6 +136,7 @@ "regParam" : 1.0E-8, "elasticNetParam" : 0.0 } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/PostRankDC.conf b/configs/PostRankDC.conf index 519851c..70e1126 100644 --- a/configs/PostRankDC.conf +++ b/configs/PostRankDC.conf @@ -58,6 +58,7 @@ "positive" : false } } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/Relevance.conf b/configs/Relevance.conf index 5b66fab..8eecf81 100644 --- a/configs/Relevance.conf +++ b/configs/Relevance.conf @@ -34,6 +34,7 @@ "fileName" : "./data/COHA_term_occurrences.txt", "epsilon" : 1.0E-20 } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/ResidualIDF.conf b/configs/ResidualIDF.conf index 8e3506f..0469f80 100644 --- a/configs/ResidualIDF.conf +++ b/configs/ResidualIDF.conf @@ -29,6 +29,7 @@ "jsonClass" : "OneFeatureTCWeighterConfig", "feature" : { "jsonClass" : "ResidualIDF" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/TotalTFIDF.conf b/configs/TotalTFIDF.conf index c77d3fb..6ece4dc 100644 --- a/configs/TotalTFIDF.conf +++ b/configs/TotalTFIDF.conf @@ -29,6 +29,7 @@ "jsonClass" : "OneFeatureTCWeighterConfig", "feature" : { "jsonClass" : "TotalTFIDF" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/Voting.conf b/configs/Voting.conf index 4b59107..77d52af 100644 --- a/configs/Voting.conf +++ b/configs/Voting.conf @@ -119,6 +119,7 @@ "missSimValue" : 0.0 } } - } ] + } ], + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/Weirdness.conf b/configs/Weirdness.conf index cfe6503..556caaa 100644 --- a/configs/Weirdness.conf +++ b/configs/Weirdness.conf @@ -34,6 +34,7 @@ "fileName" : "./data/COHA_term_occurrences.txt", "epsilon" : 0.001 } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/AvgTermFrequency.conf b/configs/cached4acl2/AvgTermFrequency.conf index 82f156a..3418aa6 100644 --- a/configs/cached4acl2/AvgTermFrequency.conf +++ b/configs/cached4acl2/AvgTermFrequency.conf @@ -142,6 +142,7 @@ "jsonClass" : "AvgTermFrequency" }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/Basic.conf b/configs/cached4acl2/Basic.conf index 724eb23..65e2185 100644 --- a/configs/cached4acl2/Basic.conf +++ b/configs/cached4acl2/Basic.conf @@ -144,6 +144,7 @@ "minSubTermSize" : 2 }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/CValue.conf b/configs/cached4acl2/CValue.conf index 7ab05a8..490d8b3 100644 --- a/configs/cached4acl2/CValue.conf +++ b/configs/cached4acl2/CValue.conf @@ -143,6 +143,7 @@ "smoothing" : 0.1 }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/ComboBasic.conf b/configs/cached4acl2/ComboBasic.conf index f561cd3..4108eca 100644 --- a/configs/cached4acl2/ComboBasic.conf +++ b/configs/cached4acl2/ComboBasic.conf @@ -144,6 +144,7 @@ "shorterTermsCoeff" : 0.1 }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/DomainPertinence.conf b/configs/cached4acl2/DomainPertinence.conf index 8c274dc..320222c 100644 --- a/configs/cached4acl2/DomainPertinence.conf +++ b/configs/cached4acl2/DomainPertinence.conf @@ -148,6 +148,7 @@ "notFoundTermSmoothing" : 0.1 }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/KeyConceptRelatedness.conf b/configs/cached4acl2/KeyConceptRelatedness.conf index c43864d..5eccab3 100644 --- a/configs/cached4acl2/KeyConceptRelatedness.conf +++ b/configs/cached4acl2/KeyConceptRelatedness.conf @@ -178,6 +178,7 @@ } }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/LinkProbability.conf b/configs/cached4acl2/LinkProbability.conf index 89f363b..947d770 100644 --- a/configs/cached4acl2/LinkProbability.conf +++ b/configs/cached4acl2/LinkProbability.conf @@ -144,6 +144,7 @@ "fileName" : "./data/info-measure.txt" }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/NovelTopicModel.conf b/configs/cached4acl2/NovelTopicModel.conf index 1e75395..b671e40 100644 --- a/configs/cached4acl2/NovelTopicModel.conf +++ b/configs/cached4acl2/NovelTopicModel.conf @@ -157,6 +157,7 @@ "topWordsForTopic" : 200 }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/PU.conf b/configs/cached4acl2/PU.conf index 497c15f..c2a75a9 100644 --- a/configs/cached4acl2/PU.conf +++ b/configs/cached4acl2/PU.conf @@ -624,6 +624,7 @@ "regParam" : 1.0E-8, "elasticNetParam" : 0.0 } - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/PostRankDC.conf b/configs/cached4acl2/PostRankDC.conf index 2a47400..2908435 100644 --- a/configs/cached4acl2/PostRankDC.conf +++ b/configs/cached4acl2/PostRankDC.conf @@ -171,6 +171,7 @@ } }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/Relevance.conf b/configs/cached4acl2/Relevance.conf index ca94e95..1dd697f 100644 --- a/configs/cached4acl2/Relevance.conf +++ b/configs/cached4acl2/Relevance.conf @@ -147,6 +147,7 @@ } }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/ResidualIDF.conf b/configs/cached4acl2/ResidualIDF.conf index 0b89de0..7e873fd 100644 --- a/configs/cached4acl2/ResidualIDF.conf +++ b/configs/cached4acl2/ResidualIDF.conf @@ -142,6 +142,7 @@ "jsonClass" : "ResidualIDF" }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/TotalTFIDF.conf b/configs/cached4acl2/TotalTFIDF.conf index 380b347..15ddefe 100644 --- a/configs/cached4acl2/TotalTFIDF.conf +++ b/configs/cached4acl2/TotalTFIDF.conf @@ -142,6 +142,7 @@ "jsonClass" : "TotalTFIDF" }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/Voting.conf b/configs/cached4acl2/Voting.conf index cbfb337..a3d50ba 100644 --- a/configs/cached4acl2/Voting.conf +++ b/configs/cached4acl2/Voting.conf @@ -607,6 +607,7 @@ } }, "cacheDirName" : "features/" - } ] + } ], + "docsToShow" : 3 } } \ No newline at end of file diff --git a/configs/cached4acl2/Weirdness.conf b/configs/cached4acl2/Weirdness.conf index b7e7a70..d180d3e 100644 --- a/configs/cached4acl2/Weirdness.conf +++ b/configs/cached4acl2/Weirdness.conf @@ -147,6 +147,7 @@ } }, "cacheDirName" : "features/" - } + }, + "docsToShow" : 3 } } \ No newline at end of file diff --git a/src/main/scala/ru/ispras/atr/datamodel/TermCandidate.scala b/src/main/scala/ru/ispras/atr/datamodel/TermCandidate.scala index d83a9b0..d3faf16 100644 --- a/src/main/scala/ru/ispras/atr/datamodel/TermCandidate.scala +++ b/src/main/scala/ru/ispras/atr/datamodel/TermCandidate.scala @@ -9,7 +9,21 @@ case class TermCandidate(occurrences: Seq[TermOccurrence]) { def lemmas = occurrences.head.lemmas - def canonicalRepr = TermOccurrence.canonicalRepresentation(occurrences.head) + def canonicalRepr: String = TermOccurrence.canonicalRepresentation(occurrences.head) + + def verboseRepr(docsToShow: Int): String = { + canonicalRepr + (if (docsToShow < 1) { + "" + } else { + val docNames: Seq[String] = occurrences.map(_.docName).distinct + val docNamesStr = docNames.slice(0, docsToShow).mkString(",") + (if (docNames.size > docsToShow) { + "..." + } else { + "" + }) + s" [$docNamesStr]" + }) + } def lengthInWords: Int = occurrences.head.lemmas.size } diff --git a/src/main/scala/ru/ispras/atr/rank/OneFeatureTCWeighter.scala b/src/main/scala/ru/ispras/atr/rank/OneFeatureTCWeighter.scala index 3c2725b..f1df7e2 100644 --- a/src/main/scala/ru/ispras/atr/rank/OneFeatureTCWeighter.scala +++ b/src/main/scala/ru/ispras/atr/rank/OneFeatureTCWeighter.scala @@ -7,7 +7,7 @@ import ru.ispras.atr.features.keyrel.{KeyConceptRelatedness, KeyConceptRelatedne /** * Simply ranks by the specified feature. */ -class OneFeatureTCWeighter(feature: FeatureConfig) extends TermCandidatesWeighter { +class OneFeatureTCWeighter(feature: FeatureConfig, docsToShow: Int) extends TermCandidatesWeighter { def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = { log.info(s"Initializing feature ${feature.id}...") @@ -17,11 +17,11 @@ class OneFeatureTCWeighter(feature: FeatureConfig) extends TermCandidatesWeighte //hack for computing number of candidates occurring in Wikipedia as concepts // val keyRel = featureComputer.asInstanceOf[KeyConceptRelatednessFC] // log.debug(s"hits: ${keyRel.word2VecAdapter.hits}; misses: ${keyRel.word2VecAdapter.misses}") - val res: Seq[(String, Double)] = candidates.map(_.canonicalRepr).zip(featureVals).sortBy(-_._2) + val res: Seq[(String, Double)] = candidates.map(_.verboseRepr(docsToShow)).zip(featureVals).sortBy(-_._2) res } } -case class OneFeatureTCWeighterConfig(feature: FeatureConfig) extends TermCandidatesWeighterConfig { - override def build(): TermCandidatesWeighter = new OneFeatureTCWeighter(feature) +case class OneFeatureTCWeighterConfig(feature: FeatureConfig, docsToShow: Int = 3) extends TermCandidatesWeighterConfig { + override def build(): TermCandidatesWeighter = new OneFeatureTCWeighter(feature, docsToShow) } diff --git a/src/main/scala/ru/ispras/atr/rank/PUTCWeighter.scala b/src/main/scala/ru/ispras/atr/rank/PUTCWeighter.scala index c7124e6..c203b5b 100644 --- a/src/main/scala/ru/ispras/atr/rank/PUTCWeighter.scala +++ b/src/main/scala/ru/ispras/atr/rank/PUTCWeighter.scala @@ -27,11 +27,13 @@ import scala.collection.JavaConversions.asScalaBuffer * @param seedsCount count of positives to be extracted * @param predictFeatures features for PU learning algorithm * @param puLearner configuration for PU learning algorithm + * @param docsToShow number of documents for term occurrences of found terms to show in the output file */ class PUTCWeighter(baseFeature: FeatureConfig, seedsCount: Int, predictFeatures: Seq[FeatureConfig], - puLearner: PositiveUnlabeledLearner) extends SparkTermCandidatesWeighter() { + puLearner: PositiveUnlabeledLearner, + docsToShow: Int) extends SparkTermCandidatesWeighter(docsToShow) { val termProbName = "category" val srcFeaturesName = "srcFeatures" @@ -64,9 +66,11 @@ case class PUTCWeighterConfig (baseFeature: FeatureConfig, puTopCount: Int = 100, predictFeatures: Seq[FeatureConfig], - puLearnerConfig: PositiveUnlabeledLearnerConfig) extends TermCandidatesWeighterConfig { + puLearnerConfig: PositiveUnlabeledLearnerConfig, + docsToShow: Int = 3 + ) extends TermCandidatesWeighterConfig { override def build(): PUTCWeighter = { - new PUTCWeighter(baseFeature, puTopCount, predictFeatures, puLearnerConfig.build()) + new PUTCWeighter(baseFeature, puTopCount, predictFeatures, puLearnerConfig.build(), docsToShow) } } diff --git a/src/main/scala/ru/ispras/atr/rank/SparkOneFeatureTCWeighter.scala b/src/main/scala/ru/ispras/atr/rank/SparkOneFeatureTCWeighter.scala index 814bea9..98285ae 100644 --- a/src/main/scala/ru/ispras/atr/rank/SparkOneFeatureTCWeighter.scala +++ b/src/main/scala/ru/ispras/atr/rank/SparkOneFeatureTCWeighter.scala @@ -8,7 +8,7 @@ import ru.ispras.atr.features.FeatureConfig * * Note that it uses Spark, so initialization may take about 30 seconds. */ -class SparkOneFeatureTCWeighter(feature: FeatureConfig) extends SparkTermCandidatesWeighter(){ +class SparkOneFeatureTCWeighter(feature: FeatureConfig, docsToShow: Int) extends SparkTermCandidatesWeighter(docsToShow){ override def id: String = feature.id @@ -18,6 +18,7 @@ class SparkOneFeatureTCWeighter(feature: FeatureConfig) extends SparkTermCandida override def weight(df: DataFrame): DataFrame = df } -case class SparkOneFeatureTCWeighterConfig(feature: FeatureConfig) extends TermCandidatesWeighterConfig { - override def build(): TermCandidatesWeighter = new SparkOneFeatureTCWeighter(feature) +case class SparkOneFeatureTCWeighterConfig(feature: FeatureConfig, + docsToShow: Int = 3) extends TermCandidatesWeighterConfig { + override def build(): TermCandidatesWeighter = new SparkOneFeatureTCWeighter(feature, docsToShow) } \ No newline at end of file diff --git a/src/main/scala/ru/ispras/atr/rank/SparkTermCandidatesWeighter.scala b/src/main/scala/ru/ispras/atr/rank/SparkTermCandidatesWeighter.scala index 46798a1..387d2f9 100644 --- a/src/main/scala/ru/ispras/atr/rank/SparkTermCandidatesWeighter.scala +++ b/src/main/scala/ru/ispras/atr/rank/SparkTermCandidatesWeighter.scala @@ -13,7 +13,7 @@ import ru.ispras.atr.features.FeatureConfig * Creates Spark Dataframe for term candidates and their scores (feature values), * then appends new column containing estimation of term probability for each candidate */ -abstract class SparkTermCandidatesWeighter() extends TermCandidatesWeighter { +abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter { val termDFName = "Term" def allFeatures: Seq[FeatureConfig] @@ -41,7 +41,7 @@ abstract class SparkTermCandidatesWeighter() extends TermCandidatesWeighter { def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = { val featureValues = convert2FeatureSpace(candidates, dataset) - val initDF = convertToDF(candidates.map(_.canonicalRepr), allFeatures.map(_.id), featureValues) + val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues) val weightedDF = weight(initDF) val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id)) val weightColId: String = id //for serialization diff --git a/src/main/scala/ru/ispras/atr/rank/VotingTCWeighter.scala b/src/main/scala/ru/ispras/atr/rank/VotingTCWeighter.scala index 40012d3..f76bce1 100644 --- a/src/main/scala/ru/ispras/atr/rank/VotingTCWeighter.scala +++ b/src/main/scala/ru/ispras/atr/rank/VotingTCWeighter.scala @@ -15,7 +15,7 @@ import scala.collection.JavaConversions.asScalaBuffer * Zhang, Z., Iria, J., Brewster, C., & Ciravegna, F. (2008, May). * A comparative evaluation of term recognition algorithms. In LREC. */ -class VotingTCWeighter(features: Seq[FeatureConfig]) extends SparkTermCandidatesWeighter() { +class VotingTCWeighter(features: Seq[FeatureConfig], docsToShow: Int) extends SparkTermCandidatesWeighter(docsToShow) { override def allFeatures: Seq[FeatureConfig] = features /** @@ -51,8 +51,8 @@ object VotingTCWeighter { val inverseUDF = udf(inverse(_:Double)) } -case class VotingTCWeighterConfig(features: Seq[FeatureConfig]) extends TermCandidatesWeighterConfig { - override def build(): TermCandidatesWeighter = new VotingTCWeighter(features) +case class VotingTCWeighterConfig(features: Seq[FeatureConfig], docsToShow: Int = 3) extends TermCandidatesWeighterConfig { + override def build(): TermCandidatesWeighter = new VotingTCWeighter(features, docsToShow) } object VotingTCWeighterConfig {