diff --git a/src/RCode/IDEA-figures/discrtization_approximation.R b/src/RCode/IDEA-figures/discrtization_approximation.R index 96caae1..8346f6e 100644 --- a/src/RCode/IDEA-figures/discrtization_approximation.R +++ b/src/RCode/IDEA-figures/discrtization_approximation.R @@ -41,11 +41,14 @@ unique(df_common$dataset) # df = select(filter(df_common, dataset == "ecg0606"), algorithm.x, frequency.x, cover.x) setnames(df, c("algorithm.y","frequency.y","cover.y")) -df = rbind(df, select(df_common,algorithm.y,frequency.y)) -setnames(df, c("algorithm","frequency")) -ggplot(df[df$frequency<50,], aes(x = frequency, fill=algorithm)) + geom_density(alpha=0.5) - +df = rbind(df, select(df_common,algorithm.y,frequency.y,cover.y)) +setnames(df, c("algorithm","frequency","cover")) +ggplot(df[df$cover>0.98 & df$frequency < 100,], aes(x = frequency, fill=algorithm)) + + geom_density(alpha=0.5, binwidth=1) + geom_vline(xintercept=14, col="red") + theme_bw() + + ggtitle(paste("Estimated kernel densities for the most frequent rule occurrence\n", + "when the total cover above 0.98")) + df = select(filter(df_common, dataset=="stdb_308"), algorithm.x, frequency.x) setnames(df, c("algorithm.y","frequency.y")) df = rbind(df, select(df_common,algorithm.y,frequency.y)) diff --git a/src/main/java/net/seninp/gi/logic/GIUtils.java b/src/main/java/net/seninp/gi/logic/GIUtils.java index e37ca7c..fe1d416 100644 --- a/src/main/java/net/seninp/gi/logic/GIUtils.java +++ b/src/main/java/net/seninp/gi/logic/GIUtils.java @@ -146,4 +146,72 @@ public static double getMeanRuleCoverage(int length, GrammarRules rules) { return (double) coverageSum / (double) length; } + /** + * Computes the fraction of time series points covered by at least one motif interval. + * + * @param seriesLength the time series length. + * @param refinedClassifiedMotifs the same-length motif clusters whose motif intervals (start inclusive, end exclusive) mark covered points. + * @return the number of covered points divided by seriesLength. 
+ */ + public static double getCoverAsFraction(int seriesLength, ArrayList refinedClassifiedMotifs) { + + boolean[] coverageArray = new boolean[seriesLength]; + + for (SameLengthMotifs rule : refinedClassifiedMotifs) { + for(SAXMotif motif : rule.getSameLenMotifs()){ + RuleInterval saxPos = motif.getPos(); + int startPos = saxPos.getStart(); + int endPos = saxPos.getEnd(); + for (int j = startPos; j < endPos; j++) { + coverageArray[j] = true; + } + } + } + + int coverSum = 0; + for (int i = 0; i < seriesLength; i++) { + if (coverageArray[i]) { + coverSum++; + } + } + return (double) coverSum / (double) seriesLength; + } + + /** + * Gets the mean motif coverage, i.e. the average number of motif intervals covering a point. + * + * @param length the original time-series length. + * @param refinedClassifiedMotifs the same-length motif clusters whose motif intervals are counted. + * @return the sum of the per-point interval counts divided by length. + */ + public static double getMeanRuleCoverage(int length, ArrayList refinedClassifiedMotifs) { + // count, for every point, how many motif intervals cover it + // + int[] coverageArray = new int[length]; + for (SameLengthMotifs rule : refinedClassifiedMotifs) { + for(SAXMotif motif : rule.getSameLenMotifs()){ + + RuleInterval saxPos = motif.getPos(); + int startPos = saxPos.getStart(); + int endPos = saxPos.getEnd(); + for (int j = startPos; j < endPos; j++) { + coverageArray[j] = coverageArray[j] + 1; + } + } + } + int minCoverage = 0; /* NOTE(review): minCoverage and maxCoverage are tracked below but do not affect the returned value */ + int maxCoverage = 0; + int coverageSum = 0; + for (int i : coverageArray) { + coverageSum += i; + if (i < minCoverage) { + minCoverage = i; + } + if (i > maxCoverage) { + maxCoverage = i; + } + } + return (double) coverageSum / (double) length; + } + } diff --git a/src/main/java/net/seninp/gi/tinker/EvaluatorClusterRule.java b/src/main/java/net/seninp/gi/tinker/EvaluatorClusterRule.java index 1d05a12..5c93d81 100644 --- a/src/main/java/net/seninp/gi/tinker/EvaluatorClusterRule.java +++ b/src/main/java/net/seninp/gi/tinker/EvaluatorClusterRule.java @@ -24,116 +24,123 @@ public class EvaluatorClusterRule { - private static final String[] DATASETS = { "ann_gun_CentroidA1", "chfdbchf15", 
"dutch_power_demand", "ecg0606", "gps_track", "insect", "mitdbx_108", "nprs43", "nprs44", - "stdb_308", "TEK14", "TEK16", "TEK17", "winding_col", "300_signal1", "318_signal1" }; + private static final String[] DATASETS = { "ann_gun_CentroidA1", "chfdbchf15", "dutch_power_demand", "ecg0606", + "gps_track", "insect", "mitdbx_108", "nprs43", "nprs44", "stdb_308", "TEK14", "TEK16", "TEK17", "winding_col", + "300_signal1", "318_signal1" }; - private static final int[] WINDOWS = { 30, 50, 70, 90, 100, 110, 120, 130, 140, 160, 180, 200, - 220, 240, 260, 280, 300, 320, 330, 340, 350, 360, 380, 400, 420, 440, 460 }; + private static final int[] WINDOWS = { 30, 50, 70, 90, 100, 110, 120, 130, 140, 160, 180, 200, 220, 240, 260, 280, + 300, 320, 330, 340, 350, 360, 380, 400, 420, 440, 460 }; - private static final int[] WINDOWS_PD = { 480, 500, 520, 540, 560, 580, 600, 320, 640, 680, 700, - 720, 740, 760, 780, 800, 820, 840, 860, 880, 900 }; + private static final int[] WINDOWS_PD = { 480, 500, 520, 540, 560, 580, 600, 320, 640, 680, 700, 720, 740, 760, 780, + 800, 820, 840, 860, 880, 900 }; /* NOTE(review): 320 sits between 600 and 640 and already appears in WINDOWS; possibly a typo for 620, confirm */ - private static final int[] PAAS = { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; - private static final int[] ALPHABETS = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 12 }; + private static final int[] PAAS = { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; + private static final int[] ALPHABETS = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 12 }; - private static final String TAB = "\t"; + private static final String TAB = "\t"; - private static final String CR = "\n"; + private static final String CR = "\n"; - private static TSProcessor tp = new TSProcessor(); - private static NormalAlphabet na = new NormalAlphabet(); - private static SAXProcessor sp = new SAXProcessor(); + private static TSProcessor tp = new TSProcessor(); + private static NormalAlphabet na = new NormalAlphabet(); + private static SAXProcessor sp = new SAXProcessor(); - public static void main(String[] args) throws 
Exception { + public static void main(String[] args) throws Exception { /* for the dataset DATASETS[args[0]], iterates over every window, PAA and alphabet size and writes one tab-separated row per combination to stdout and to the output file */ - double thresholdLength = 0.1; - double thresholdCom = 0.5; - double fractionTopDist = 0.67; - - String dataset = DATASETS[Integer.valueOf(args[0])]; + double thresholdLength = 0.1; + double thresholdCom = 0.5; + double fractionTopDist = 0.67; - System.out.println("Sampling " + dataset); + String dataset = DATASETS[Integer.valueOf(args[0])]; - BufferedWriter bw = new BufferedWriter( - new FileWriter(new File(dataset + "_repair_grammarsampler_clusterrule.out"))); - bw.write("dataset\twindow\tpaa\talphabet\tapproximation\t"); - bw.write("rules\tgr_size\tfrequency\tcover\tcoverage\t"); - bw.write("pruned_rules\tpruned_gr_size\tpruned_frequency\tpruned_cover\tpruned_coverage\n"); + System.out.println("Sampling " + dataset); - double[] series = tp.readTS("src/resources/test-data/" + dataset + ".txt", 0); + BufferedWriter bw = new BufferedWriter(new FileWriter(new File(dataset + "_repair_grammarsampler_clusterrule.out"))); + bw.write("dataset\twindow\tpaa\talphabet\tapproximation\t"); + bw.write("rules\tgr_size\tfrequency\tcover\tcoverage\t"); + bw.write("packed_rules\tpruned_gr_size\tpacked_frequency\tpruned_cover\tpacked_coverage\n"); /* NOTE(review): these column names mix packed_ and pruned_ prefixes */ - if ("300_signal1".equalsIgnoreCase(dataset) || "318_signal1".equalsIgnoreCase(dataset)) { - series = Arrays.copyOfRange(series, 0, 30000); - } + double[] series = tp.readTS("src/resources/test-data/" + dataset + ".txt", 0); - ArrayList wins = new ArrayList(); - for (int i : WINDOWS) { - wins.add(i); - } - if ("dutch_power_demand".equalsIgnoreCase(dataset)) { - for (int i : WINDOWS_PD) { - wins.add(i); - } - } + if ("300_signal1".equalsIgnoreCase(dataset) || "318_signal1".equalsIgnoreCase(dataset)) { + series = Arrays.copyOfRange(series, 0, 30000); + } - for (int w : wins) { - for (int p : PAAS) { - for (int a : ALPHABETS) { + ArrayList wins = new ArrayList(); + for (int i : WINDOWS) { + wins.add(i); + } + if ("dutch_power_demand".equalsIgnoreCase(dataset)) { + for 
(int i : WINDOWS_PD) { + wins.add(i); + } + } - SAXRecords saxData = sp.ts2saxViaWindow(series, w, p, na.getCuts(a), - NumerosityReductionStrategy.EXACT, 0.01); + for (int w : wins) { + for (int p : PAAS) { + for (int a : ALPHABETS) { - // sequitur section - // - String discretizedTS = saxData.getSAXString(" "); + SAXRecords saxData = sp.ts2saxViaWindow(series, w, p, na.getCuts(a), NumerosityReductionStrategy.EXACT, 0.01); - SAXRule grammar = SequiturFactory.runSequitur(discretizedTS); - GrammarRules rules = grammar.toGrammarRulesData(); - SequiturFactory.updateRuleIntervals(rules, saxData, true, series, w, p); + // sequitur section + // + String discretizedTS = saxData.getSAXString(" "); - ArrayList refinedClassifiedMotifs=ClusterRuleFactory.performPruning(series, rules, - thresholdLength, thresholdCom, fractionTopDist); - ArrayList packedRules = ClusterRuleFactory.getPackedRule(refinedClassifiedMotifs); + SAXRule grammar = SequiturFactory.runSequitur(discretizedTS); + GrammarRules rules = grammar.toGrammarRulesData(); + SequiturFactory.updateRuleIntervals(rules, saxData, true, series, w, p); - RuleOrganizer ro = new RuleOrganizer(); - SAXPointsNumber[] pointsOccurenceInPackedRule = ro.countPointNumberAfterRemoving(series, - refinedClassifiedMotifs); - + ArrayList refinedClassifiedMotifs = ClusterRuleFactory.performPruning(series, rules, + thresholdLength, thresholdCom, fractionTopDist); + ArrayList packedRules = ClusterRuleFactory.getPackedRule(refinedClassifiedMotifs); - StringBuilder sb = new StringBuilder(); + RuleOrganizer ro = new RuleOrganizer(); + SAXPointsNumber[] pointsOccurenceInPackedRule = ro.countPointNumberAfterRemoving(series, + refinedClassifiedMotifs); - sb.append(dataset).append(TAB); + StringBuilder sb = new StringBuilder(); - sb.append(w).append(TAB); - sb.append(p).append(TAB); - sb.append(a).append(TAB); - sb.append(sp.approximationDistancePAA(series, w, p, 0.01) - + sp.approximationDistanceAlphabet(series, w, p, a, 0.01)).append(TAB); + 
sb.append(dataset).append(TAB); - sb.append(rules.size()).append(TAB); - sb.append(RulePrunerFactory.computeGrammarSize(rules, p)).append(TAB); - sb.append(rules.getHighestFrequency()).append(TAB); - sb.append(GIUtils.getCoverAsFraction(series.length, rules)).append(TAB); - sb.append(GIUtils.getMeanRuleCoverage(series.length, rules)).append(TAB); + sb.append(w).append(TAB); + sb.append(p).append(TAB); + sb.append(a).append(TAB); + sb.append( + sp.approximationDistancePAA(series, w, p, 0.01) + sp.approximationDistanceAlphabet(series, w, p, a, 0.01)) + .append(TAB); - sb.append(packedRules.size()).append(TAB); - sb.append("none").append(TAB); - sb.append("none").append(TAB); - sb.append("none").append(TAB); - sb.append("none").append(CR); + sb.append(rules.size()).append(TAB); + sb.append(RulePrunerFactory.computeGrammarSize(rules, p)).append(TAB); + sb.append(rules.getHighestFrequency()).append(TAB); + sb.append(GIUtils.getCoverAsFraction(series.length, rules)).append(TAB); + sb.append(GIUtils.getMeanRuleCoverage(series.length, rules)).append(TAB); - System.out.print(sb.toString()); - bw.write(sb.toString()); + sb.append(packedRules.size()).append(TAB); + sb.append("none").append(TAB); /* pruned_gr_size column: no value is computed, the literal "none" is written */ + sb.append(getHighestFrequency(pointsOccurenceInPackedRule)).append(TAB); + sb.append(GIUtils.getCoverAsFraction(series.length, refinedClassifiedMotifs)).append(TAB); + sb.append(GIUtils.getMeanRuleCoverage(series.length, refinedClassifiedMotifs)).append(CR); - } - } - } + System.out.print(sb.toString()); + bw.write(sb.toString()); - bw.close(); + } + } + } - } + bw.close(); + } + public static int getHighestFrequency(SAXPointsNumber[] pointsOccurenceInPackedRule) { /* returns the largest getPointOccurenceNumber() among the entries, or 0 when no entry exceeds 0 */ + int res = 0; + for (SAXPointsNumber r : pointsOccurenceInPackedRule) { + if (r.getPointOccurenceNumber() > res) { + res = r.getPointOccurenceNumber(); + } + + } + return res; + } }