Skip to content

Commit

Permalink
show names of input docs for found terms (configurable)
Browse files Browse the repository at this point in the history
  • Loading branch information
astrakhantsev committed May 18, 2017
1 parent 4ceaa8b commit 3d3bfee
Show file tree
Hide file tree
Showing 36 changed files with 95 additions and 46 deletions.
3 changes: 2 additions & 1 deletion configs/AvgTermFrequency.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"jsonClass" : "OneFeatureTCWeighterConfig",
"feature" : {
"jsonClass" : "AvgTermFrequency"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/Basic.conf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"jsonClass" : "Basic",
"longerTermsCoeff" : 0.72,
"minSubTermSize" : 2
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/CValue.conf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"feature" : {
"jsonClass" : "CValue",
"smoothing" : 0.1
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/ComboBasic.conf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"jsonClass" : "ComboBasic",
"longerTermsCoeff" : 0.75,
"shorterTermsCoeff" : 0.1
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/DomainPertinence.conf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"epsilon" : 1.0E-20
},
"notFoundTermSmoothing" : 0.1
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/KeyConceptRelatedness.conf
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
"missSimValue" : 0.0
}
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/LinkProbability.conf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"jsonClass" : "LinkProbability",
"threshold" : 0.018,
"fileName" : "./data/info-measure.txt"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/NovelTopicModel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"rareWordsThreshold" : 1,
"randomSeed" : 13,
"topWordsForTopic" : 200
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/PU.conf
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
"regParam" : 1.0E-8,
"elasticNetParam" : 0.0
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/PostRankDC.conf
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"positive" : false
}
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/Relevance.conf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"fileName" : "./data/COHA_term_occurrences.txt",
"epsilon" : 1.0E-20
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/ResidualIDF.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"jsonClass" : "OneFeatureTCWeighterConfig",
"feature" : {
"jsonClass" : "ResidualIDF"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/TotalTFIDF.conf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"jsonClass" : "OneFeatureTCWeighterConfig",
"feature" : {
"jsonClass" : "TotalTFIDF"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/Voting.conf
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
"missSimValue" : 0.0
}
}
} ]
} ],
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/Weirdness.conf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"fileName" : "./data/COHA_term_occurrences.txt",
"epsilon" : 0.001
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/AvgTermFrequency.conf
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
"jsonClass" : "AvgTermFrequency"
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/Basic.conf
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
"minSubTermSize" : 2
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/CValue.conf
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@
"smoothing" : 0.1
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/ComboBasic.conf
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
"shorterTermsCoeff" : 0.1
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/DomainPertinence.conf
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@
"notFoundTermSmoothing" : 0.1
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/KeyConceptRelatedness.conf
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@
}
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/LinkProbability.conf
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@
"fileName" : "./data/info-measure.txt"
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/NovelTopicModel.conf
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
"topWordsForTopic" : 200
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/PU.conf
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,7 @@
"regParam" : 1.0E-8,
"elasticNetParam" : 0.0
}
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/PostRankDC.conf
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@
}
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/Relevance.conf
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
}
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/ResidualIDF.conf
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
"jsonClass" : "ResidualIDF"
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/TotalTFIDF.conf
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
"jsonClass" : "TotalTFIDF"
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/Voting.conf
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,7 @@
}
},
"cacheDirName" : "features/"
} ]
} ],
"docsToShow" : 3
}
}
3 changes: 2 additions & 1 deletion configs/cached4acl2/Weirdness.conf
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
}
},
"cacheDirName" : "features/"
}
},
"docsToShow" : 3
}
}
16 changes: 15 additions & 1 deletion src/main/scala/ru/ispras/atr/datamodel/TermCandidate.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,21 @@ case class TermCandidate(occurrences: Seq[TermOccurrence]) {

def lemmas = occurrences.head.lemmas

def canonicalRepr = TermOccurrence.canonicalRepresentation(occurrences.head)
def canonicalRepr: String = TermOccurrence.canonicalRepresentation(occurrences.head)

/**
  * Canonical representation of this candidate, optionally followed by the names
  * of the documents in which it occurs.
  *
  * @param docsToShow upper bound on the number of distinct document names to list;
  *                   any value below 1 yields the bare canonical representation
  * @return the canonical representation, suffixed with " [name1,name2]" when
  *         `docsToShow` is at least 1; "..." is appended inside the brackets when
  *         the candidate occurs in more distinct documents than were listed
  */
def verboseRepr(docsToShow: Int): String = {
  val base = canonicalRepr
  if (docsToShow < 1) {
    base
  } else {
    // Distinct names keep the order in which the occurrences are stored.
    val distinctDocs: Seq[String] = occurrences.map(_.docName).distinct
    val listed = distinctDocs.take(docsToShow).mkString(",")
    val ellipsis = if (distinctDocs.size > docsToShow) "..." else ""
    base + s" [$listed$ellipsis]"
  }
}

def lengthInWords: Int = occurrences.head.lemmas.size
}
8 changes: 4 additions & 4 deletions src/main/scala/ru/ispras/atr/rank/OneFeatureTCWeighter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import ru.ispras.atr.features.keyrel.{KeyConceptRelatedness, KeyConceptRelatedne
/**
* Simply ranks by the specified feature.
*/
class OneFeatureTCWeighter(feature: FeatureConfig) extends TermCandidatesWeighter {
class OneFeatureTCWeighter(feature: FeatureConfig, docsToShow: Int) extends TermCandidatesWeighter {

def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
log.info(s"Initializing feature ${feature.id}...")
Expand All @@ -17,11 +17,11 @@ class OneFeatureTCWeighter(feature: FeatureConfig) extends TermCandidatesWeighte
//hack for computing number of candidates occurring in Wikipedia as concepts
// val keyRel = featureComputer.asInstanceOf[KeyConceptRelatednessFC]
// log.debug(s"hits: ${keyRel.word2VecAdapter.hits}; misses: ${keyRel.word2VecAdapter.misses}")
val res: Seq[(String, Double)] = candidates.map(_.canonicalRepr).zip(featureVals).sortBy(-_._2)
val res: Seq[(String, Double)] = candidates.map(_.verboseRepr(docsToShow)).zip(featureVals).sortBy(-_._2)
res
}
}

case class OneFeatureTCWeighterConfig(feature: FeatureConfig) extends TermCandidatesWeighterConfig {
override def build(): TermCandidatesWeighter = new OneFeatureTCWeighter(feature)
/**
  * Configuration from which an [[OneFeatureTCWeighter]] is built.
  *
  * @param feature    configuration of the single feature used for ranking
  * @param docsToShow value forwarded to the weighter, which passes it to
  *                   `TermCandidate.verboseRepr` when labelling ranked candidates;
  *                   defaults to 3
  */
case class OneFeatureTCWeighterConfig(feature: FeatureConfig, docsToShow: Int = 3) extends TermCandidatesWeighterConfig {
override def build(): TermCandidatesWeighter = new OneFeatureTCWeighter(feature, docsToShow)
}
10 changes: 7 additions & 3 deletions src/main/scala/ru/ispras/atr/rank/PUTCWeighter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ import scala.collection.JavaConversions.asScalaBuffer
* @param seedsCount count of positives to be extracted
* @param predictFeatures features for PU learning algorithm
* @param puLearner configuration for PU learning algorithm
* @param docsToShow number of input document names to show for each found term in the output file
*/
class PUTCWeighter(baseFeature: FeatureConfig,
seedsCount: Int,
predictFeatures: Seq[FeatureConfig],
puLearner: PositiveUnlabeledLearner) extends SparkTermCandidatesWeighter() {
puLearner: PositiveUnlabeledLearner,
docsToShow: Int) extends SparkTermCandidatesWeighter(docsToShow) {

val termProbName = "category"
val srcFeaturesName = "srcFeatures"
Expand Down Expand Up @@ -64,9 +66,11 @@ case class PUTCWeighterConfig
(baseFeature: FeatureConfig,
puTopCount: Int = 100,
predictFeatures: Seq[FeatureConfig],
puLearnerConfig: PositiveUnlabeledLearnerConfig) extends TermCandidatesWeighterConfig {
puLearnerConfig: PositiveUnlabeledLearnerConfig,
docsToShow: Int = 3
) extends TermCandidatesWeighterConfig {
override def build(): PUTCWeighter = {
new PUTCWeighter(baseFeature, puTopCount, predictFeatures, puLearnerConfig.build())
new PUTCWeighter(baseFeature, puTopCount, predictFeatures, puLearnerConfig.build(), docsToShow)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import ru.ispras.atr.features.FeatureConfig
*
* Note that it uses Spark, so initialization may take about 30 seconds.
*/
class SparkOneFeatureTCWeighter(feature: FeatureConfig) extends SparkTermCandidatesWeighter(){
class SparkOneFeatureTCWeighter(feature: FeatureConfig, docsToShow: Int) extends SparkTermCandidatesWeighter(docsToShow){

override def id: String = feature.id

Expand All @@ -18,6 +18,7 @@ class SparkOneFeatureTCWeighter(feature: FeatureConfig) extends SparkTermCandida
override def weight(df: DataFrame): DataFrame = df
}

case class SparkOneFeatureTCWeighterConfig(feature: FeatureConfig) extends TermCandidatesWeighterConfig {
override def build(): TermCandidatesWeighter = new SparkOneFeatureTCWeighter(feature)
/**
  * Configuration from which a [[SparkOneFeatureTCWeighter]] is built.
  *
  * @param feature    configuration of the single feature handed to the weighter
  * @param docsToShow value forwarded unchanged to the weighter's constructor; defaults to 3
  *                   (presumably the number of document names shown per found term,
  *                   as in the other weighters -- confirm in SparkTermCandidatesWeighter)
  */
case class SparkOneFeatureTCWeighterConfig(feature: FeatureConfig,
docsToShow: Int = 3) extends TermCandidatesWeighterConfig {
override def build(): TermCandidatesWeighter = new SparkOneFeatureTCWeighter(feature, docsToShow)
}
Loading

0 comments on commit 3d3bfee

Please sign in to comment.