-
Notifications
You must be signed in to change notification settings - Fork 0
/
references.bib
3825 lines (3547 loc) · 332 KB
/
references.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% ApkCombiner (Li et al., 2015): chapter in the edited volume "ICT Systems Security and Privacy Protection".
% The citation key is derived from the first editor (Zotero export); keep it stable for existing \cite commands.
@incollection{federrath_apkcombiner:_2015,
  author     = {Li, Li and Bartel, Alexandre and Bissyandé, Tegawendé F. and Klein, Jacques and Le Traon, Yves},
  title      = {{ApkCombiner}: {Combining} {Multiple} {Android} {Apps} to {Support} {Inter}-{App} {Analysis}},
  shorttitle = {{ApkCombiner}},
  booktitle  = {{ICT} {Systems} {Security} and {Privacy} {Protection}},
  editor     = {Federrath, Hannes and Gollmann, Dieter},
  series     = {{IFIP} Advances in Information and Communication Technology},
  volume     = {455},
  pages      = {513--527},
  publisher  = {Springer International Publishing},
  address    = {Cham},
  year       = {2015},
  doi        = {10.1007/978-3-319-18467-8_34},
  url        = {http://link.springer.com/10.1007/978-3-319-18467-8_34},
  urldate    = {2018-04-10},
  isbn       = {978-3-319-18466-1 978-3-319-18467-8},
  language   = {en},
  keywords   = {static, characterization, leaks},
  abstract   = {Android apps are made of components which can leak information between one another using the ICC mechanism. With the growing momentum of Android, a number of research contributions have led to tools for the intra-app analysis of Android apps. Unfortunately, these state-of-the-art approaches, and the associated tools, have long left out the security flaws that arise across the boundaries of single apps, in the interaction between several apps. In this paper, we present a tool called ApkCombiner which aims at reducing an inter-app communication problem to an intra-app inter-component communication problem. In practice, ApkCombiner combines different apps into a single apk on which existing tools can indirectly perform inter-app analysis. We have evaluated ApkCombiner on a dataset of 3,000 real-world Android apps, to demonstrate its capability to support static context-aware inter-app analysis scenarios.},
  file       = {Li et al. - 2015 - ApkCombiner Combining Multiple Android Apps to Su.pdf:/home/fmind/Documents/Zotero/storage/M7C5CRE6/Li et al. - 2015 - ApkCombiner Combining Multiple Android Apps to Su.pdf:application/pdf}
}
% Two-page abstract on instrumentation-based detection of inter-component leaks (Li et al., 2015).
% No venue is recorded, so the entry is typed as misc; pagetotal holds the document length.
% NOTE(review): if the venue is known, retype the entry and add the venue field.
@misc{li_using_2015,
  author    = {Li, Li and Bissyandé, Tegawendé F. and Klein, Jacques and Le Traon, Yves},
  title     = {Using {An} {Instrumentation} based {Approach} to {Detect} {Inter}-{Component} {Leaks} in {Android} {Apps}},
  year      = {2015},
  month     = mar,
  pagetotal = {2},
  language  = {en},
  keywords  = {detection, static, leaks},
  file      = {Li et al. - Using An Instrumentation based Approach to Detect .pdf:/home/fmind/Documents/Zotero/storage/ZVFKLBFW/Li et al. - Using An Instrumentation based Approach to Detect .pdf:application/pdf}
}
% Potential component leaks (PCLs) as a feature set for Android malware detection (Li et al., 2015).
% The proceedings title follows the DOI prefix 10.1109/QRS.2015.
@inproceedings{li_potential_2015,
  author     = {Li, Li and Allix, Kevin and Li, Daoyuan and Bartel, Alexandre and Bissyandé, Tegawendé F. and Klein, Jacques},
  title      = {Potential {Component} {Leaks} in {Android} {Apps}: {An} {Investigation} into a {New} {Feature} {Set} for {Malware} {Detection}},
  shorttitle = {Potential {Component} {Leaks} in {Android} {Apps}},
  booktitle  = {2015 {IEEE} International Conference on Software Quality, Reliability and Security},
  pages      = {195--200},
  publisher  = {IEEE},
  year       = {2015},
  month      = aug,
  doi        = {10.1109/QRS.2015.36},
  url        = {http://ieeexplore.ieee.org/document/7272932/},
  urldate    = {2018-04-10},
  isbn       = {978-1-4673-7989-2},
  language   = {en},
  keywords   = {detection, static, leaks},
  abstract   = {We discuss the capability of a new feature set for malware detection based on potential component leaks (PCLs). PCLs are defined as sensitive data-flows that involve Android inter-component communications. We show that PCLs are common in Android apps and that malicious applications indeed manipulate significantly more PCLs than benign apps. Then, we evaluate a machine learning-based approach relying on PCLs. Experimental validations show high performance for identifying malware, demonstrating that PCLs can be used for discriminating malicious apps from benign apps.},
  file       = {Li et al. - 2015 - Potential Component Leaks in Android Apps An Inve.pdf:/home/fmind/Documents/Zotero/storage/2CLI4DWF/Li et al. - 2015 - Potential Component Leaks in Android Apps An Inve.pdf:application/pdf}
}
% Opcode-sequence features combined with machine learning for Android malware detection (Jerome et al., 2014).
% The proceedings title follows the DOI prefix 10.1109/ICC.2014.
@inproceedings{jerome_using_2014,
  author    = {Jerome, Quentin and Allix, Kevin and State, Radu and Engel, Thomas},
  title     = {Using opcode-sequences to detect malicious {Android} applications},
  booktitle = {2014 {IEEE} International Conference on Communications ({ICC})},
  pages     = {914--919},
  publisher = {IEEE},
  year      = {2014},
  month     = jun,
  doi       = {10.1109/ICC.2014.6883436},
  url       = {http://ieeexplore.ieee.org/document/6883436/},
  urldate   = {2018-04-10},
  isbn      = {978-1-4799-2003-7},
  language  = {en},
  keywords  = {detection, static},
  abstract  = {Recently, the Android platform has seen its number of malicious applications increased sharply. Motivated by the easy application submission process and the number of alternative market places for distributing Android applications, rogue authors are developing constantly new malicious programs. While current anti-virus software mainly relies on signature detection, the issue of alternative malware detection has to be addressed. In this paper, we present a feature based detection mechanism relying on opcode-sequences combined with machine learning techniques. We assess our tool on both a reference dataset known as Genome Project as well as on a wider sample of 40,000 applications retrieved from the Google Play Store.},
  file      = {Jerome et al. - 2014 - Using opcode-sequences to detect malicious Android.pdf:/home/fmind/Documents/Zotero/storage/AJ77Z362/Jerome et al. - 2014 - Using opcode-sequences to detect malicious Android.pdf:application/pdf}
}
% Timeline relevance of training datasets for ML-based malware detection (Allix et al., 2015).
% Chapter in "Engineering Secure Software and Systems"; the citation key is derived from the first editor.
@incollection{piessens_are_2015,
  author    = {Allix, Kevin and Bissyandé, Tegawendé F. and Klein, Jacques and Le Traon, Yves},
  title     = {Are {Your} {Training} {Datasets} {Yet} {Relevant}?},
  booktitle = {Engineering {Secure} {Software} and {Systems}},
  editor    = {Piessens, Frank and Caballero, Juan and Bielova, Nataliia},
  series    = {Lecture Notes in Computer Science},
  volume    = {8978},
  pages     = {51--67},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2015},
  doi       = {10.1007/978-3-319-15618-7_5},
  url       = {http://link.springer.com/10.1007/978-3-319-15618-7_5},
  urldate   = {2018-04-10},
  isbn      = {978-3-319-15617-0 978-3-319-15618-7},
  language  = {en},
  keywords  = {study},
  abstract  = {In this paper, we consider the relevance of timeline in the construction of datasets, to highlight its impact on the performance of a machine learning-based malware detection scheme. Typically, we show that simply picking a random set of known malware to train a malware detector, as it is done in many assessment scenarios from the literature, yields significantly biased results. In the process of assessing the extent of this impact through various experiments, we were also able to confirm a number of intuitive assumptions about Android malware. For instance, we discuss the existence of Android malware lineages and how they could impact the performance of malware detection in the wild.},
  file      = {Allix et al. - 2015 - Are Your Training Datasets Yet Relevant.pdf:/home/fmind/Documents/Zotero/storage/XJ9UGXG7/Allix et al. - 2015 - Are Your Training Datasets Yet Relevant.pdf:application/pdf}
}
% Journal article (Empirical Software Engineering 21(1), 2016) comparing in-the-lab and
% in-the-wild validation of machine learning-based Android malware detectors.
@article{allix_empirical_2016,
  author     = {Allix, Kevin and Bissyandé, Tegawendé F. and Jérome, Quentin and Klein, Jacques and State, Radu and Le Traon, Yves},
  title      = {Empirical assessment of machine learning-based malware detectors for {Android}: {Measuring} the gap between in-the-lab and in-the-wild validation scenarios},
  shorttitle = {Empirical assessment of machine learning-based malware detectors for {Android}},
  journal    = {Empirical Software Engineering},
  volume     = {21},
  number     = {1},
  pages      = {183--211},
  year       = {2016},
  month      = feb,
  doi        = {10.1007/s10664-014-9352-6},
  url        = {http://link.springer.com/10.1007/s10664-014-9352-6},
  urldate    = {2018-04-10},
  issn       = {1382-3256, 1573-7616},
  language   = {en},
  keywords   = {study},
  abstract   = {To address the issue of malware detection through large sets of applications, researchers have recently started to investigate the capabilities of machine-learning techniques for proposing effective approaches. So far, several promising results were recorded in the literature, many approaches being assessed with what we call in the lab validation scenarios. This paper revisits the purpose of malware detection to discuss whether such in the lab validation scenarios provide reliable indications on the performance of malware detectors in real-world settings, aka in the wild.},
  file       = {Allix et al. - 2016 - Empirical assessment of machine learning-based mal.pdf:/home/fmind/Documents/Zotero/storage/8VFQCJBF/Allix et al. - 2016 - Empirical assessment of machine learning-based mal.pdf:application/pdf}
}
% IccTA: detection of inter-component privacy leaks in Android apps (Li et al., 2015).
% The proceedings title follows the DOI prefix 10.1109/ICSE.2015.
@inproceedings{li_iccta:_2015,
  author     = {Li, Li and Bartel, Alexandre and Bissyandé, Tegawendé F. and Klein, Jacques and Le Traon, Yves and Arzt, Steven and Rasthofer, Siegfried and Bodden, Eric and Octeau, Damien and McDaniel, Patrick},
  title      = {{IccTA}: {Detecting} {Inter}-{Component} {Privacy} {Leaks} in {Android} {Apps}},
  shorttitle = {{IccTA}},
  booktitle  = {2015 {IEEE}/{ACM} 37th {IEEE} International Conference on Software Engineering},
  pages      = {280--291},
  publisher  = {IEEE},
  year       = {2015},
  month      = may,
  doi        = {10.1109/ICSE.2015.48},
  url        = {http://ieeexplore.ieee.org/document/7194581/},
  urldate    = {2018-04-10},
  isbn       = {978-1-4799-1934-5},
  language   = {en},
  keywords   = {detection, static, leaks},
  abstract   = {Shake Them All is a popular “Wallpaper” application exceeding millions of downloads on Google Play. At installation, this application is given permission to (1) access the Internet (for updating wallpapers) and (2) use the device microphone (to change background following noise changes). With these permissions, the application could silently record user conversations and upload them remotely. To give more confidence about how Shake Them All actually processes what it records, it is necessary to build a precise analysis tool that tracks the flow of any sensitive data from its source point to any sink, especially if those are in different components.},
  file       = {Li et al. - 2015 - IccTA Detecting Inter-Component Privacy Leaks in .pdf:/home/fmind/Documents/Zotero/storage/FP337E2L/Li et al. - 2015 - IccTA Detecting Inter-Component Privacy Leaks in .pdf:application/pdf}
}
% PCLeaks: finding and automatically exploiting potential component leaks (Li et al., 2014).
% The proceedings title follows the DOI prefix 10.1109/TrustCom.2014.
@inproceedings{li_automatically_2014,
  author    = {Li, Li and Bartel, Alexandre and Klein, Jacques and Le Traon, Yves},
  title     = {Automatically {Exploiting} {Potential} {Component} {Leaks} in {Android} {Applications}},
  booktitle = {2014 {IEEE} 13th International Conference on Trust, Security and Privacy in Computing and Communications},
  pages     = {388--397},
  publisher = {IEEE},
  year      = {2014},
  month     = sep,
  doi       = {10.1109/TrustCom.2014.50},
  url       = {http://ieeexplore.ieee.org/document/7011274/},
  urldate   = {2018-04-10},
  isbn      = {978-1-4799-6513-7},
  language  = {en},
  keywords  = {leaks},
  abstract  = {We present PCLeaks, a tool based on intercomponent communication (ICC) vulnerabilities to perform dataflow analysis on Android applications to find potential component leaks that could potentially be exploited by other components. To evaluate our approach, we run PCLeaks on 2000 apps randomly selected from the Google Play store. PCLeaks reports 986 potential component leaks in 185 apps. For each leak reported by PCLeaks, PCLeaksValidator automatically generates an Android app which tries to exploit the leak. By manually running a subset of the generated apps, we find that 75\% of the reported leaks are exploitable leaks.},
  file      = {Li et al. - 2014 - Automatically Exploiting Potential Component Leaks.pdf:/home/fmind/Documents/Zotero/storage/4ZA23SCB/Li et al. - 2014 - Automatically Exploiting Potential Component Leaks.pdf:application/pdf}
}
% Forensic study of a large set of Android malware and benign applications (Allix et al., 2014).
% The proceedings title follows the DOI prefix 10.1109/COMPSAC.2014.
@inproceedings{allix_forensic_2014,
  author    = {Allix, Kevin and Jerome, Quentin and Bissyandé, Tegawendé F. and Klein, Jacques and State, Radu and Le Traon, Yves},
  title     = {A {Forensic} {Analysis} of {Android} {Malware} -- {How} is {Malware} {Written} and {How} it {Could} {Be} {Detected}?},
  booktitle = {2014 {IEEE} 38th Annual Computer Software and Applications Conference},
  pages     = {384--393},
  publisher = {IEEE},
  year      = {2014},
  month     = jul,
  doi       = {10.1109/COMPSAC.2014.61},
  url       = {http://ieeexplore.ieee.org/document/6899240/},
  urldate   = {2018-04-10},
  isbn      = {978-1-4799-3575-8},
  language  = {en},
  keywords  = {study},
  abstract  = {We consider in this paper the analysis of a large set of malware and benign applications from the Android ecosystem. Although a large body of research work has dealt with Android malware over the last years, none has addressed it from a forensic point of view.},
  file      = {Allix et al. - 2014 - A Forensic Analysis of Android Malware -- How is M.pdf:/home/fmind/Documents/Zotero/storage/4QV3I6RA/Allix et al. - 2014 - A Forensic Analysis of Android Malware -- How is M.pdf:application/pdf}
}
% Large-scale evaluation of 10-fold cross validation for ML-based malware detection (Allix et al., 2014).
% The title uses LaTeX quote ligatures, because straight double quotes typeset as two closing quotes.
@inproceedings{allix_large-scale_2014,
  author     = {Allix, Kevin and Bissyandé, Tegawendé F. and Jérome, Quentin and Klein, Jacques and State, Radu and Le Traon, Yves},
  title      = {Large-scale machine learning-based malware detection: confronting the {``10-fold cross validation''} scheme with reality},
  shorttitle = {Large-scale machine learning-based malware detection},
  booktitle  = {Proceedings of the 4th {ACM} Conference on Data and Application Security and Privacy},
  pages      = {163--166},
  publisher  = {ACM Press},
  year       = {2014},
  doi        = {10.1145/2557547.2557587},
  url        = {http://dl.acm.org/citation.cfm?doid=2557547.2557587},
  urldate    = {2018-04-10},
  isbn       = {978-1-4503-2278-2},
  language   = {en},
  keywords   = {detection, design},
  abstract   = {To address the issue of malware detection, researchers have recently started to investigate the capabilities of machine-learning techniques for proposing effective approaches. Several promising results were recorded in the literature, many approaches being assessed with the common “10-Fold cross validation” scheme. This paper revisits the purpose of malware detection to discuss the adequacy of the “10-Fold” scheme for validating techniques that may not perform well in reality. To this end, we have devised several Machine Learning classifiers that rely on a novel set of features built from applications’ CFGs. We use a sizeable dataset of over 50,000 Android applications collected from sources where state-of-the-art approaches have selected their data. We show that our approach outperforms existing machine learning-based approaches. However, this high performance on usual-size datasets does not translate in high performance in the wild.},
  file       = {Allix et al. - 2014 - Large-scale machine learning-based malware detecti.pdf:/home/fmind/Documents/Zotero/storage/GLMC679F/Allix et al. - 2014 - Large-scale machine learning-based malware detecti.pdf:application/pdf}
}
% Manuscript on the role of dataset history in ML-based Android malware detection (Allix and Klein, 2014).
% No venue is recorded, so the entry is typed as misc; pagetotal holds the document length.
% NOTE(review): the author list may be incomplete and the work may be a technical report; confirm against the PDF.
@misc{allix_machine_2014,
  author    = {Allix, Kevin and Klein, Jacques},
  title     = {Machine {Learning}-{Based} {Malware} {Detection} for {Android} {Applications}: {History} {Matters}!},
  year      = {2014},
  month     = may,
  pagetotal = {17},
  language  = {en},
  keywords  = {detection, design},
  abstract  = {Machine Learning-based malware detection is a promising scalable method for identifying suspicious applications. In particular, in today’s mobile computing realm where thousands of applications are daily poured into markets, such a technique could be valuable to guarantee a strong filtering of malicious apps. The success of machine-learning approaches however is highly dependent on (1) the quality of the datasets that are used for training and of (2) the appropriateness of the tested datasets with regards to the built classifiers. Unfortunately, there is scarce mention of these aspects in the evaluation of existing state-of-the-art approaches in the literature.},
  file      = {Allix and Klein - Machine Learning-Based Malware Detection for Andro.pdf:/home/fmind/Documents/Zotero/storage/K6UN2DFM/Allix and Klein - Machine Learning-Based Malware Detection for Andro.pdf:application/pdf}
}
% Static taint analysis proposal for detecting privacy leaks in Android apps (Li et al., 2014).
% The exported journal value was the first author's name, not a venue, so it is dropped and the entry typed as misc.
% NOTE(review): add the real venue if it is known.
@misc{li_detecting_2014,
  author    = {Li, Li and Bartel, Alexandre and Klein, Jacques},
  title     = {Detecting privacy leaks in {Android} {Apps}},
  year      = {2014},
  month     = feb,
  pagetotal = {6},
  language  = {en},
  keywords  = {detection, leaks},
  abstract  = {The number of Android apps have grown explosively in recent years and the number of apps leaking private data have also grown. It is necessary to make sure all the apps are not leaking private data before putting them to the app markets and thereby a privacy leaks detection tool is needed. We propose a static taint analysis approach which leverages the control-flow graph (CFG) of apps to detect privacy leaks among Android apps. We tackle three problems related to intercomponent communication (ICC), lifecycle of components and callback mechanism making the CFG imprecision. To bridge this gap, we explicitly connect the discontinuities of the CFG to provide a precise CFG. Based on the precise CFG, we aim at providing a taint analysis approach to detect intra-component privacy leaks, inter-component privacy leaks and also inter-app privacy leaks.},
  file      = {Li et al. - Detecting privacy leaks in Android Apps.pdf:/home/fmind/Documents/Zotero/storage/PFHIPK2P/Li et al. - Detecting privacy leaks in Android Apps.pdf:application/pdf}
}
% Two-page abstract on path matching for detecting inter-component leaks (Li et al., 2014).
% No venue is recorded, so the entry is typed as misc; pagetotal holds the document length.
% NOTE(review): if the venue is known, retype the entry and add the venue field.
@misc{li_using_2014,
  author    = {Li, Li and Bartel, Alexandre and Klein, Jacques},
  title     = {Using {A} {Path} {Matching} {Algorithm} to {Detect} {Inter}-{Component} {Leaks} in {Android} {Apps}},
  year      = {2014},
  month     = mar,
  pagetotal = {2},
  language  = {en},
  keywords  = {detection, static, leaks},
  file      = {Li et al. - Using A Path Matching Algorithm to Detect Inter-Co.pdf:/home/fmind/Documents/Zotero/storage/P9BCESZ4/Li et al. - Using A Path Matching Algorithm to Detect Inter-Co.pdf:application/pdf}
}
% Epicc: static analysis of Android inter-component communication (Octeau et al., 2013).
% Published at the USENIX Security Symposium, so typed as inproceedings with the printed page range.
% NOTE(review): author list and page range were completed from the published version; verify against the proceedings.
@inproceedings{octeau_effective_2013,
  author    = {Octeau, Damien and McDaniel, Patrick and Jha, Somesh and Bartel, Alexandre and Bodden, Eric and Klein, Jacques and Le Traon, Yves},
  title     = {Effective {Inter}-{Component} {Communication} {Mapping} in {Android} with {Epicc}: {An} {Essential} {Step} {Towards} {Holistic} {Security} {Analysis}},
  booktitle = {Proceedings of the 22nd {USENIX} Security Symposium},
  pages     = {543--558},
  year      = {2013},
  month     = aug,
  language  = {en},
  keywords  = {characterization},
  abstract  = {Many threats present in smartphones are the result of interactions between application components, not just artifacts of single components. However, current techniques for identifying inter-application communication are ad hoc and do not scale to large numbers of applications. In this paper, we reduce the discovery of inter-component communication (ICC) in smartphones to an instance of the Interprocedural Distributive Environment (IDE) problem, and develop a sound static analysis technique targeted to the Android platform. We apply this analysis to 1,200 applications selected from the Play store and characterize the locations and substance of their ICC. Experiments show that full specifications for ICC can be identified for over 93\% of ICC locations for the applications studied. Further the analysis scales well; analysis of each application took on average 113 seconds to complete. Epicc, the resulting tool, finds ICC vulnerabilities with far fewer false positives than the next best tool. In this way, we develop a scalable vehicle to extend current security analysis to entire collections of applications as well as the interfaces they export.},
  file      = {Octeau et al. - Effective Inter-Component Communication Mapping in.pdf:/home/fmind/Documents/Zotero/storage/U8752DQZ/Octeau et al. - Effective Inter-Component Communication Mapping in.pdf:application/pdf}
}
% Model-driven mutation testing of adaptation logic in dynamically adaptive systems (Bartel et al., 2011).
% The proceedings title follows the DOI prefix 10.1109/ICSTW.2011.
@inproceedings{bartel_model_2011,
  author    = {Bartel, Alexandre and Baudry, Benoit and Munoz, Freddy and Klein, Jacques and Mouelhi, Tejeddine and Le Traon, Yves},
  title     = {Model {Driven} {Mutation} {Applied} to {Adaptative} {Systems} {Testing}},
  booktitle = {2011 {IEEE} Fourth International Conference on Software Testing, Verification and Validation Workshops},
  pages     = {408--413},
  publisher = {IEEE},
  year      = {2011},
  month     = mar,
  doi       = {10.1109/ICSTW.2011.24},
  url       = {http://ieeexplore.ieee.org/document/5954440/},
  urldate   = {2018-04-10},
  isbn      = {978-1-4577-0019-4},
  language  = {en},
  abstract  = {Dynamically Adaptive Systems modify their behavior and structure in response to changes in their surrounding environment and according to an adaptation logic. Critical systems increasingly incorporate dynamic adaptation capabilities; examples include disaster relief and space exploration systems. In this paper, we focus on mutation testing of the adaptation logic. We propose a fault model for adaptation logics that classifies faults into environmental completeness and adaptation correctness. Since there are several adaptation logic languages relying on the same underlying concepts, the fault model is expressed independently from specific adaptation languages. Taking benefit from model-driven engineering technology, we express these common concepts in a metamodel and define the operational semantics of mutation operators at this level. Mutation is applied on model elements and model transformations are used to propagate these changes to a given adaptation policy in the chosen formalism. Preliminary results on an adaptive web server highlight the difficulty of killing mutants for adaptive systems, and thus the difficulty of generating efficient tests.},
  file      = {Bartel et al. - 2011 - Model Driven Mutation Applied to Adaptative System.pdf:/home/fmind/Documents/Zotero/storage/H7E48HUB/Bartel et al. - 2011 - Model Driven Mutation Applied to Adaptative System.pdf:application/pdf}
}
% Dexpler: conversion of Dalvik bytecode to Jimple for analysis with Soot (Bartel et al., 2012).
% NOTE(review): the proceedings title was added from the ACM DOI record; confirm the wording.
@inproceedings{bartel_dexpler:_2012,
  author     = {Bartel, Alexandre and Klein, Jacques and Le Traon, Yves and Monperrus, Martin},
  title      = {Dexpler: converting {Android} {Dalvik} bytecode to {Jimple} for static analysis with {Soot}},
  shorttitle = {Dexpler},
  booktitle  = {Proceedings of the {ACM} {SIGPLAN} International Workshop on State of the Art in {Java} Program Analysis},
  pages      = {27--38},
  publisher  = {ACM Press},
  year       = {2012},
  doi        = {10.1145/2259051.2259056},
  url        = {http://dl.acm.org/citation.cfm?doid=2259051.2259056},
  urldate    = {2018-04-10},
  isbn       = {978-1-4503-1490-9},
  language   = {en},
  keywords   = {static, characterization},
  abstract   = {This paper introduces Dexpler, a software package which converts Dalvik bytecode to Jimple. Dexpler is built on top of Dedexer and Soot. As Jimple is Soot’s main internal representation of code, the Dalvik bytecode can be manipulated with any Jimple based tool, for instance for performing point-to or flow analysis.},
  file       = {Bartel et al. - 2012 - Dexpler converting Android Dalvik bytecode to Jim.pdf:/home/fmind/Documents/Zotero/storage/R8VCWFT4/Bartel et al. - 2012 - Dexpler converting Android Dalvik bytecode to Jim.pdf:application/pdf}
}
% Permission-gap analysis: removing unneeded Android permissions to reduce the attack surface (Bartel et al., 2012).
% NOTE(review): the proceedings title was added from the ACM DOI record; pages holds only the first page, so confirm the end page.
@inproceedings{bartel_automatically_2012,
  author     = {Bartel, Alexandre and Klein, Jacques and Le Traon, Yves and Monperrus, Martin},
  title      = {Automatically {Securing} {Permission}-{Based} {Software} by {Reducing} the {Attack} {Surface}: {An} {Application} to {Android}},
  shorttitle = {Automatically securing permission-based software by reducing the attack surface},
  booktitle  = {Proceedings of the 27th {IEEE}/{ACM} International Conference on Automated Software Engineering},
  pages      = {274},
  publisher  = {ACM Press},
  year       = {2012},
  doi        = {10.1145/2351676.2351722},
  url        = {http://dl.acm.org/citation.cfm?doid=2351676.2351722},
  urldate    = {2018-04-10},
  isbn       = {978-1-4503-1204-2},
  language   = {en},
  keywords   = {static, permissions},
  abstract   = {Android based devices are becoming widespread. As a result and since those devices contain personal and confidential data, the security model of the android software stack has been analyzed extensively. One key feature of the security model is that applications must declare a list of permissions they are using to access resources. Using static analysis, we first extracted a table from the Android API which maps methods to permissions. Then, we use this mapping within a tool we developed to check that applications effectively need all the permissions they declare. Using our tool on a set of android applications, we found out that a non negligible part of the applications do not use all the permissions they declare. Consequently, the attack surface of such applications can be reduced by removing the non-needed permissions.},
  file       = {Bartel et al. - 2012 - Automatically securing permission-based software b.pdf:/home/fmind/Documents/Zotero/storage/7SRW852B/Bartel et al. - 2012 - Automatically securing permission-based software b.pdf:application/pdf}
}
% Metrics for the lack of consensus among antivirus engines when building Android malware ground truths (Hurier et al., 2016).
% No venue is recorded, so the entry is typed as misc; pagetotal holds the manuscript length.
% NOTE(review): if this is the published conference version, retype the entry and add the venue.
@misc{hurier_lack_2016,
  author    = {Hurier, Médéric and Allix, Kevin and Bissyandé, Tegawendé F. and Klein, Jacques and Le Traon, Yves},
  title     = {On the {Lack} of {Consensus} in {Anti}-{Virus} {Decisions}: {Metrics} and {Insights} on {Building} {Ground} {Truths} of {Android} {Malware} with {VirusTotal}},
  year      = {2016},
  month     = jul,
  pagetotal = {20},
  language  = {en},
  copyright = {All rights reserved},
  keywords  = {vt-analysis},
  abstract  = {There is generally a lack of consensus in Antivirus (AV) engines' decisions on a given sample. This challenges the building of authoritative ground-truth datasets. Instead, researchers and practitioners may rely on unvalidated approaches to build their ground truth, e.g., by considering decisions from a selected set of Antivirus vendors or by setting up a threshold number of positive detections before classifying a sample. Both approaches are biased as they implicitly either decide on ranking AV products, or they consider that all AV decisions have equal weights. In this paper, we extensively investigate the lack of agreement among AV engines. To that end, we propose a set of metrics that quantitatively describe the different dimensions of this lack of consensus. We show how our metrics can bring important insights by using the detection results of 66 AV products on 2 million Android apps as a case study. Our analysis focuses not only on AV binary decision but also on the notoriously hard problem of labels that AVs associate with suspicious files, and allows to highlight biases hidden in the collection of a malware ground truth --- a foundation stone of any machine learning-based malware detection approach.},
  file      = {Hurier et al. - Metrics and Insights on Building Ground Truths of .pdf:/home/fmind/Documents/Zotero/storage/XG3AZ6XT/Hurier et al. - Metrics and Insights on Building Ground Truths of .pdf:application/pdf}
}
% Undated manuscript proposing machine apprenticeship for Android malware analysis (Hurier).
% The title was exported from the attachment filename; the file extension is removed from it.
% NOTE(review): the title probably reads "Idea: A Machine Apprenticeship Approach ..."; confirm against the PDF.
@misc{hurier_idea_nodate,
  author    = {Hurier, Médéric},
  title     = {Idea a {Machine} {Apprenticeship} {Approach} for {Android} {Malware} {Analysis}},
  copyright = {All rights reserved},
  abstract  = {Android malware are becoming more diverse and complex over the years. To manage their evolution, security researchers have proposed supervised learning models able to automatically train a system to perform pattern recognition tasks. In practice, these systems have shown promising results, but are not yet as transparent and adaptable as the analysis of human experts. They also suffer from several limitations that could limit their implementation in a production environment. In this paper, we propose to evaluate a technique from the field of Artificial Intelligence called Machine Apprenticeship that could address these short-comings. Instead of manually selected features, this approach promotes the use of expert demonstrations that guide the system through learning a complex task. This idea was successfully implemented in other fields, such in video games and robot locomotion, but is still unexplored in the security domain. Applied to malware analysis, this method could lead to autonomous systems more suited for anomaly detection operations.},
  file      = {Idea a Machine Apprenticeship Approach for Android Malware Analysis.pdf:/home/fmind/Documents/Zotero/storage/QE4NP4CP/Idea a Machine Apprenticeship Approach for Android Malware Analysis.pdf:application/pdf}
}
% Adversarial neural cryptography: neural networks learning to protect communications (Abadi and Andersen, 2016).
% Typed as misc with arXiv eprint fields because no journal is recorded; pagetotal holds the manuscript length.
% NOTE(review): the arXiv identifier was added from memory of the public preprint; verify before relying on it.
@misc{abadi_learning_2016,
  author     = {Abadi, Martín and Andersen, David G.},
  title      = {Learning to Protect Communications with Adversarial Neural Cryptography},
  year       = {2016},
  eprint     = {1610.06918},
  eprinttype = {arXiv},
  pagetotal  = {15},
  language   = {en},
  abstract   = {We ask whether neural networks can learn to use secret keys to protect information from other neural networks. Specifically, we focus on ensuring confidentiality properties in a multiagent system, and we specify those properties in terms of an adversary. Thus, a system may consist of neural networks named Alice and Bob, and we aim to limit what a third neural network named Eve learns from eavesdropping on the communication between Alice and Bob. We do not prescribe specific cryptographic algorithms to these neural networks; instead, we train end-to-end, adversarially. We demonstrate that the neural networks can learn how to perform forms of encryption and decryption, and also how to apply these operations selectively in order to meet confidentiality goals.},
  file       = {Abadi and Andersen - 2016 - LEARNING TO PROTECT COMMUNICATIONS WITH ADVERSARIA.pdf:/home/fmind/Documents/Zotero/storage/2RWWK7AG/Abadi and Andersen - 2016 - LEARNING TO PROTECT COMMUNICATIONS WITH ADVERSARIA.pdf:application/pdf}
}
@comment{Out of the Tar Pit has two authors, Ben Moseley and Peter Marks; the export dropped the first one. It is not a journal article, so it is typed as misc, and the PDF page count is kept as pagetotal.}
@misc{marks_out_2006,
title = {Out of the {Tar} {Pit}},
abstract = {Complexity is the single major difficulty in the successful development of large-scale software systems. Following Brooks we distinguish accidental from essential difficulty, but disagree with his premise that most complexity remaining in contemporary systems is essential. We identify common causes of complexity and discuss general approaches which can be taken to eliminate them where they are accidental in nature. To make things more concrete we then give an outline for a potential complexity-minimizing approach based on functional programming and Codd’s relational model of data.},
language = {en},
author = {Moseley, Ben and Marks, Peter},
year = {2006},
pagetotal = {66},
file = {Marks - Ben Moseley [email protected]:/home/fmind/Documents/Zotero/storage/ET9BD8Q6/Marks - Ben Moseley [email protected]:application/pdf}
}
@comment{Brumley et al. 2008: author names stored in full in Last, First form (Hao Wang was braced as a single surname), and the garbled complexity bound in the abstract restored to math.}
@article{brumley_theory_2008,
title = {Theory and {Techniques} for {Automatic} {Generation} of {Vulnerability}-{Based} {Signatures}},
volume = {5},
issn = {1545-5971},
url = {http://ieeexplore.ieee.org/document/4624274/},
doi = {10.1109/TDSC.2008.55},
abstract = {In this paper, we explore the problem of creating vulnerability signatures. A vulnerability signature is based on a program vulnerability and is not specific to any particular exploit. The advantage of vulnerability signatures is that their quality can be guaranteed. In particular, we create vulnerability signatures from the vulnerable program itself, such that they are guaranteed to have zero false positives by construction. We show how to automate signature creation for vulnerabilities that can be detected by a runtime monitor. There is no one right signature representation for a vulnerability. We introduce a formalism and way of thinking about vulnerability signature generation that is analysis centric instead of representation specific. In particular, a signature can be represented in many ways, from using regular expression to using a full Turing-complete language. Previous systems have mostly focused on a particular point in the design space. We show how to approximate the language of a vulnerability in many different language classes, each of which has unique properties and benefits, by performing analysis on the program binary and vulnerability. Our approach also considers multiple-path vulnerabilities. A multiple-path vulnerability is a vulnerability that can be exploited through several different code paths. For example, a Web server may have a vulnerability in a URL handling routine that is called for many different types of requests. We demonstrate techniques that can create signatures that cover multiple paths an exploit may take. We have had to develop new algorithms to cope with the problem where enumerating vulnerable paths leads to an exponential explosion. We develop a new approach that captures the logical semantics of multiple vulnerable program paths in $O(n^2)$ space (where n is the size of the program) instead of exponential. 
We provide a formal definition of a vulnerability signature and investigate the computational complexity of creating and matching vulnerability signatures. We systematically explore the design space of vulnerability signatures. We also provide specific techniques for creating vulnerability signatures in a variety of language classes. In order to demonstrate our techniques, we have built a prototype system. Our experiments show that we can, using a single exploit, automatically generate a vulnerability signature as a regular expression, as a small program, or as a system of constraints. We demonstrate techniques for creating signatures of vulnerabilities that can be exploited via multiple program paths. Our results indicate that our approach is a viable option for signature generation, especially when guarantees are desired.},
language = {en},
number = {4},
urldate = {2018-04-10},
journal = {IEEE Transactions on Dependable and Secure Computing},
author = {Brumley, David and Newsome, James and Song, Dawn and Wang, Hao and Jha, Somesh},
month = oct,
year = {2008},
pages = {224--241},
file = {Brumley et al. - 2008 - Theory and Techniques for Automatic Generation of .pdf:/home/fmind/Documents/Zotero/storage/QXH7YAYJ/Brumley et al. - 2008 - Theory and Techniques for Automatic Generation of .pdf:application/pdf}
}
@comment{Yamaguchi et al. 2014: added the booktitle that inproceedings requires (IEEE Symposium on Security and Privacy) and repaired the de-hyphenated word error-prone in the abstract.}
@inproceedings{yamaguchi_modeling_2014,
title = {Modeling and {Discovering} {Vulnerabilities} with {Code} {Property} {Graphs}},
booktitle = {2014 {IEEE} {Symposium} on {Security} and {Privacy}},
isbn = {978-1-4799-4686-0},
url = {http://ieeexplore.ieee.org/document/6956589/},
doi = {10.1109/SP.2014.44},
abstract = {The vast majority of security breaches encountered today are a direct result of insecure code. Consequently, the protection of computer systems critically depends on the rigorous identification of vulnerabilities in software, a tedious and error-prone process requiring significant expertise. Unfortunately, a single flaw suffices to undermine the security of a system and thus the sheer amount of code to audit plays into the attacker’s cards. In this paper, we present a method to effectively mine large amounts of source code for vulnerabilities. To this end, we introduce a novel representation of source code called a code property graph that merges concepts of classic program analysis, namely abstract syntax trees, control flow graphs and program dependence graphs, into a joint data structure. This comprehensive representation enables us to elegantly model templates for common vulnerabilities with graph traversals that, for instance, can identify buffer overflows, integer overflows, format string vulnerabilities, or memory disclosures. We implement our approach using a popular graph database and demonstrate its efficacy by identifying 18 previously unknown vulnerabilities in the source code of the Linux kernel.},
language = {en},
urldate = {2018-04-10},
publisher = {IEEE},
author = {Yamaguchi, Fabian and Golde, Nico and Arp, Daniel and Rieck, Konrad},
month = may,
year = {2014},
pages = {590--604},
file = {Yamaguchi et al. - 2014 - Modeling and Discovering Vulnerabilities with Code.pdf:/home/fmind/Documents/Zotero/storage/GBR6WXKT/Yamaguchi et al. - 2014 - Modeling and Discovering Vulnerabilities with Code.pdf:application/pdf}
}
@comment{Wasserstein and Lazar 2016: title and shorttitle now agree on the italic p, and the stray space before the hyphen in shorttitle is removed.}
@article{wasserstein_asas_2016,
title = {The {ASA}'s {Statement} on \textit{p}-{Values}: {Context}, {Process}, and {Purpose}},
volume = {70},
issn = {0003-1305, 1537-2731},
shorttitle = {The {ASA}'s {Statement} on \textit{p}-{Values}},
url = {https://www.tandfonline.com/doi/full/10.1080/00031305.2016.1154108},
doi = {10.1080/00031305.2016.1154108},
language = {en},
number = {2},
urldate = {2018-04-10},
journal = {The American Statistician},
author = {Wasserstein, Ronald L. and Lazar, Nicole A.},
month = apr,
year = {2016},
pages = {129--133},
file = {Wasserstein and Lazar - 2016 - The ASA's Statement on ipi -Values Context, .pdf:/home/fmind/Documents/Zotero/storage/7PD2YCF6/Wasserstein and Lazar - 2016 - The ASA's Statement on ipi -Values Context, .pdf:application/pdf}
}
@comment{Breiman 2001 appeared in Statistical Science 16(3). The exported journal field was a fragment of the title and pages held the page count; both replaced with the published venue and page range.}
@article{breiman_statistical_2001,
title = {Statistical {Modeling}: {The} {Two} {Cultures}},
volume = {16},
issn = {0883-4237},
doi = {10.1214/ss/1009213726},
abstract = {There are two cultures in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a given stochastic data model. The other uses algorithmic models and treats the data mechanism as unknown. The statistical community has been committed to the almost exclusive use of data models. This commitment has led to irrelevant theory, questionable conclusions, and has kept statisticians from working on a large range of interesting current problems. Algorithmic modeling, both in theory and practice, has developed rapidly in fields outside statistics. It can be used both on large complex data sets and as a more accurate and informative alternative to data modeling on smaller data sets. If our goal as a field is to use data to solve problems, then we need to move away from exclusive dependence on data models and adopt a more diverse set of tools.},
language = {en},
number = {3},
journal = {Statistical Science},
author = {Breiman, Leo},
month = aug,
year = {2001},
pages = {199--231},
file = {Breiman - Statistical Modeling The Two Cultures.pdf:/home/fmind/Documents/Zotero/storage/XESGGSFP/Breiman - Statistical Modeling The Two Cultures.pdf:application/pdf}
}
@comment{Arcuri and Briand 2011: added the booktitle that inproceedings requires (ICSE 2011) and completed the page range, which previously gave only the first page.}
@inproceedings{arcuri_practical_2011,
title = {A practical guide for using statistical tests to assess randomized algorithms in software engineering},
booktitle = {Proceedings of the 33rd {International} {Conference} on {Software} {Engineering} ({ICSE} '11)},
isbn = {978-1-4503-0445-0},
url = {http://portal.acm.org/citation.cfm?doid=1985793.1985795},
doi = {10.1145/1985793.1985795},
abstract = {Randomized algorithms have been used to successfully address many different types of software engineering problems. This type of algorithms employ a degree of randomness as part of their logic. Randomized algorithms are useful for difficult problems where a precise solution cannot be derived in a deterministic way within reasonable time. However, randomized algorithms produce different results on every run when applied to the same problem instance. It is hence important to assess the effectiveness of randomized algorithms by collecting data from a large enough number of runs. The use of rigorous statistical tests is then essential to provide support to the conclusions derived by analyzing such data. In this paper, we provide a systematic review of the use of randomized algorithms in selected software engineering venues in 2009. Its goal is not to perform a complete survey but to get a representative snapshot of current practice in software engineering research. We show that randomized algorithms are used in a significant percentage of papers but that, in most cases, randomness is not properly accounted for. This casts doubts on the validity of most empirical results assessing randomized algorithms. There are numerous statistical tests, based on different assumptions, and it is not always clear when and how to use these tests. We hence provide practical guidelines to support empirical research on randomized algorithms in software engineering.},
language = {en},
urldate = {2018-04-10},
publisher = {ACM Press},
author = {Arcuri, Andrea and Briand, Lionel},
year = {2011},
pages = {1--10},
file = {Arcuri and Briand - 2011 - A practical guide for using statistical tests to a.pdf:/home/fmind/Documents/Zotero/storage/2YN8IC4P/Arcuri and Briand - 2011 - A practical guide for using statistical tests to a.pdf:application/pdf}
}
@comment{Sebastiani tutorial: no journal is recorded, so an article type only produces a missing-field warning. Typed as misc; the PDF page count is kept as pagetotal. NOTE(review): venue and year could not be confirmed from the record.}
@misc{sebastiani_tutorial_2010,
title = {A {Tutorial} on {Probability} {Theory}},
language = {en},
author = {Sebastiani, Paola},
year = {2010},
pagetotal = {25},
file = {Sebastiani - A Tutorial on Probability Theory.pdf:/home/fmind/Documents/Zotero/storage/PHB5FNPZ/Sebastiani - A Tutorial on Probability Theory.pdf:application/pdf}
}
@comment{Rafique and Caballero 2013 (RAID 2013): the exported editor list mixed the Lecture Notes in Computer Science series editors with the volume editors. Only the volume editors are kept, and the series is recorded in its own field.}
@incollection{hutchison_firma:_2013,
address = {Berlin, Heidelberg},
series = {Lecture {Notes} in {Computer} {Science}},
title = {{FIRMA}: {Malware} {Clustering} and {Network} {Signature} {Generation} with {Mixed} {Network} {Behaviors}},
volume = {8145},
isbn = {978-3-642-41283-7 978-3-642-41284-4},
shorttitle = {{FIRMA}},
url = {http://link.springer.com/10.1007/978-3-642-41284-4_8},
abstract = {The ever-increasing number of malware families and polymorphic variants creates a pressing need for automatic tools to cluster the collected malware into families and generate behavioral signatures for their detection. Among these, network traffic is a powerful behavioral signature and network signatures are widely used by network administrators. In this paper we present FIRMA, a tool that given a large pool of network traffic obtained by executing unlabeled malware binaries, generates a clustering of the malware binaries into families and a set of network signatures for each family. Compared with prior tools, FIRMA produces network signatures for each of the network behaviors of a family, regardless of the type of traffic the malware uses (e.g., HTTP, IRC, SMTP, TCP, UDP). We have implemented FIRMA and evaluated it on two recent datasets comprising nearly 16,000 unique malware binaries. Our results show that FIRMA’s clustering has very high precision (100\% on a labeled dataset) and recall (97.7\%). We compare FIRMA’s signatures with manually generated ones, showing that they are as good (often better), while generated in a fraction of the time.},
language = {en},
urldate = {2018-04-10},
booktitle = {Research in {Attacks}, {Intrusions}, and {Defenses}},
publisher = {Springer Berlin Heidelberg},
author = {Rafique, M. Zubair and Caballero, Juan},
editor = {Stolfo, Salvatore J. and Stavrou, Angelos and Wright, Charles V.},
year = {2013},
doi = {10.1007/978-3-642-41284-4_8},
pages = {144--163},
file = {Rafique and Caballero - 2013 - FIRMA Malware Clustering and Network Signature Ge.pdf:/home/fmind/Documents/Zotero/storage/RR8XRIDK/Rafique and Caballero - 2013 - FIRMA Malware Clustering and Network Signature Ge.pdf:application/pdf}
}
@comment{Perdisci, Lee and Feamster 2010 is a conference paper (USENIX NSDI 2010), so it is typed as inproceedings with a booktitle. The value 14 was the PDF page count and is kept as pagetotal. NOTE(review): add the proceedings page range once confirmed.}
@inproceedings{perdisci_behavioral_2010,
title = {Behavioral {Clustering} of {HTTP}-{Based} {Malware} and {Signature} {Generation} {Using} {Malicious} {Network} {Traces}},
booktitle = {Proceedings of the 7th {USENIX} {Symposium} on {Networked} {Systems} {Design} and {Implementation} ({NSDI} '10)},
publisher = {USENIX Association},
abstract = {We present a novel network-level behavioral malware clustering system. We focus on analyzing the structural similarities among malicious HTTP traffic traces generated by executing HTTP-based malware. Our work is motivated by the need to provide quality input to algorithms that automatically generate network signatures. Accordingly, we define similarity metrics among HTTP traces and develop our system so that the resulting clusters can yield high-quality malware signatures.},
language = {en},
author = {Perdisci, Roberto and Lee, Wenke and Feamster, Nick},
month = apr,
year = {2010},
pagetotal = {14},
file = {Perdisci et al. - Behavioral Clustering of HTTP-Based Malware and Si.pdf:/home/fmind/Documents/Zotero/storage/NW8A3IMF/Perdisci et al. - Behavioral Clustering of HTTP-Based Malware and Si.pdf:application/pdf}
}
@comment{Le Traon, Baudry and Jézéquel 2006: given names stored in full instead of initials, and the accents restored on the third author's surname (the file is UTF-8).}
@article{le_traon_design_2006,
title = {Design by {Contract} to {Improve} {Software} {Vigilance}},
volume = {32},
issn = {0098-5589, 1939-3520},
url = {http://ieeexplore.ieee.org/document/1703388/},
doi = {10.1109/TSE.2006.79},
abstract = {Design by Contract is a lightweight technique for embedding elements of formal specification (such as invariants, pre and postconditions) into an object-oriented design. When contracts are made executable, they can play the role of embedded, online oracles. Executable contracts allow components to be responsive to erroneous states and, thus, may help in detecting and locating faults. In this paper, we define Vigilance as the degree to which a program is able to detect an erroneous state at runtime. Diagnosability represents the effort needed to locate a fault once it has been detected. In order to estimate the benefit of using Design by Contract, we formalize both notions of Vigilance and Diagnosability as software quality measures. The main steps of measure elaboration are given, from informal definitions of the factors to be measured to the mathematical model of the measures. As is the standard in this domain, the parameters are then fixed through actual measures, based on a mutation analysis in our case. Several measures are presented that reveal and estimate the contribution of contracts to the overall quality of a system in terms of vigilance and diagnosability.},
language = {en},
number = {8},
urldate = {2018-04-10},
journal = {IEEE Transactions on Software Engineering},
author = {Le Traon, Yves and Baudry, Benoit and Jézéquel, Jean-Marc},
month = aug,
year = {2006},
pages = {571--586},
file = {Le Traon et al. - 2006 - Design by Contract to Improve Software Vigilance.pdf:/home/fmind/Documents/Zotero/storage/43C487W2/Le Traon et al. - 2006 - Design by Contract to Improve Software Vigilance.pdf:application/pdf}
}
@comment{Jacobs 2009 appeared in ACM Queue 7(6), July 2009; added the journal, volume and number that the article type needs. The value 12 was the PDF page count and is kept as pagetotal.}
@article{jacobs_pathologies_2009,
title = {The {Pathologies} of {Big} {Data}},
volume = {7},
language = {en},
number = {6},
journal = {ACM Queue},
author = {Jacobs, Adam},
month = jul,
year = {2009},
pagetotal = {12},
file = {Jacobs - The Pathologies of Big Data.pdf:/home/fmind/Documents/Zotero/storage/X34LBYUD/Jacobs - The Pathologies of Big Data.pdf:application/pdf}
}
@comment{Kimball 2011: the exported author list was the cover line "A Kimball Group White Paper" parsed as a person. The sole author is Ralph Kimball; the document is a white paper, so it is typed as techreport with the Kimball Group as institution. The PDF page count is kept as pagetotal.}
@techreport{white_evolving_2011,
title = {The {Evolving} {Role} of the {Enterprise} {Data} {Warehouse} in the {Era} of {Big} {Data} {Analytics}},
type = {White {Paper}},
institution = {Kimball Group},
language = {en},
author = {Kimball, Ralph},
year = {2011},
pagetotal = {33},
file = {White and Kimball - The Evolving Role of the Enterprise Data Warehouse.pdf:/home/fmind/Documents/Zotero/storage/TM3K9ZRM/White and Kimball - The Evolving Role of the Enterprise Data Warehouse.pdf:application/pdf}
}
@comment{Provost and Fawcett, Big Data 1(1), 2013. Fields ordered as author, title, venue, date, identifiers, then notes.}
@article{provost_data_2013,
  author   = {Provost, Foster and Fawcett, Tom},
  title    = {Data {Science} and its {Relationship} to {Big} {Data} and {Data}-{Driven} {Decision} {Making}},
  journal  = {Big Data},
  volume   = {1},
  number   = {1},
  pages    = {51--59},
  month    = mar,
  year     = {2013},
  issn     = {2167-6461, 2167-647X},
  doi      = {10.1089/big.2013.1508},
  url      = {http://online.liebertpub.com/doi/10.1089/big.2013.1508},
  urldate  = {2018-04-10},
  language = {en},
  abstract = {Companies have realized they need to hire data scientists, academic institutions are scrambling to put together data-science programs, and publications are touting data science as a hot—even ‘‘sexy’’—career choice. However, there is confusion about what exactly data science is, and this confusion could lead to disillusionment as the concept diffuses into meaningless buzz. In this article, we argue that there are good reasons why it has been hard to pin down exactly what is data science. One reason is that data science is intricately intertwined with other important concepts also of growing importance, such as big data and data-driven decision making. Another reason is the natural tendency to associate what a practitioner does with the definition of the practitioner’s field; this can result in overlooking the fundamentals of the field. We believe that trying to define the boundaries of data science precisely is not of the utmost importance. We can debate the boundaries of the field in an academic setting, but in order for data science to serve business effectively, it is important (i) to understand its relationships to other important related concepts, and (ii) to begin to identify the fundamental principles underlying data science. Once we embrace (ii), we can much better understand and explain exactly what data science has to offer. Furthermore, only once we embrace (ii) should we be comfortable calling it data science. In this article, we present a perspective that addresses all these concepts. We close by offering, as examples, a partial list of fundamental principles underlying data science.},
  file     = {Provost and Fawcett - 2013 - Data Science and its Relationship to Big Data and .pdf:/home/fmind/Documents/Zotero/storage/AVAWY5CS/Provost and Fawcett - 2013 - Data Science and its Relationship to Big Data and .pdf:application/pdf}
}
@comment{Nigam et al. 2000 appeared in Machine Learning 39(2/3). The export had only the first author, in all caps, and no venue; the three co-authors, journal, volume, issue and page range are restored.}
@article{nigam_text_2000,
title = {Text {Classification} from {Labeled} and {Unlabeled} {Documents} using {EM}},
volume = {39},
issn = {0885-6125},
doi = {10.1023/A:1007692713085},
abstract = {This paper shows that the accuracy of learned text classifiers can be improved by augmenting a small number of labeled training documents with a large pool of unlabeled documents. This is important because in many text classification problems obtaining training labels is expensive, while large quantities of unlabeled documents are readily available.},
language = {en},
number = {2/3},
journal = {Machine Learning},
author = {Nigam, Kamal and McCallum, Andrew Kachites and Thrun, Sebastian and Mitchell, Tom},
year = {2000},
pages = {103--134},
file = {NIGAM - Text Classification from Labeled and Unlabeled Doc.pdf:/home/fmind/Documents/Zotero/storage/BXWGF59D/NIGAM - Text Classification from Labeled and Unlabeled Doc.pdf:application/pdf}
}
@comment{Goethals 2003: no journal is recorded, so an article type only produces a missing-field warning. Typed as misc; the PDF page count is kept as pagetotal. NOTE(review): this is usually cited as a University of Helsinki report; switch to techreport once the institution is confirmed.}
@misc{goethals_survey_2003,
title = {Survey on {Frequent} {Pattern} {Mining}},
language = {en},
author = {Goethals, Bart},
year = {2003},
pagetotal = {43},
file = {Goethals - Survey on Frequent Pattern Mining.pdf:/home/fmind/Documents/Zotero/storage/2E7KZRXB/Goethals - Survey on Frequent Pattern Mining.pdf:application/pdf}
}
@comment{Melnik, Garcia-Molina and Rahm 2002: added the booktitle that inproceedings requires (ICDE 2002), spelled out the abbreviated publisher, and stored given names in full.}
@inproceedings{melnik_similarity_2002,
title = {Similarity flooding: a versatile graph matching algorithm and its application to schema matching},
booktitle = {Proceedings of the 18th {International} {Conference} on {Data} {Engineering} ({ICDE} 2002)},
isbn = {978-0-7695-1531-1},
shorttitle = {Similarity flooding},
url = {http://ieeexplore.ieee.org/document/994702/},
doi = {10.1109/ICDE.2002.994702},
abstract = {Matching elements of two data schemas or two data instances plays a key role in data warehousing, e-business, or even biochemical applications. In this paper we present a matching algorithm based on a fixpoint computation that is usable across different scenarios. The algorithm takes two graphs (schemas, catalogs, or other data structures) as input, and produces as output a mapping between corresponding nodes of the graphs. Depending on the matching goal, a subset of the mapping is chosen using filters. After our algorithm runs, we expect a human to check and if necessary adjust the results. As a matter of fact, we evaluate the ‘accuracy’ of the algorithm by counting the number of needed adjustments. We conducted a user study, in which our accuracy metric was used to estimate the labor savings that the users could obtain by utilizing our algorithm to obtain an initial matching. Finally, we illustrate how our matching algorithm is deployed as one of several high-level operators in an implemented testbed for managing information models and mappings.},
language = {en},
urldate = {2018-04-10},
publisher = {IEEE Computer Society},
author = {Melnik, Sergey and Garcia-Molina, Hector and Rahm, Erhard},
year = {2002},
pages = {117--128},
file = {Melnik et al. - 2002 - Similarity flooding a versatile graph matching al.pdf:/home/fmind/Documents/Zotero/storage/NAUG8JN3/Melnik et al. - 2002 - Similarity flooding a versatile graph matching al.pdf:application/pdf}
}
@comment{Wang and Pottinger, SeMap, is a conference paper from EDBT 2008; typed as inproceedings with booktitle and year added. The citation key is kept unchanged for existing citations. The value 12 was the PDF page count and is kept as pagetotal. NOTE(review): add the proceedings page range once confirmed.}
@inproceedings{wang_semap:_nodate,
title = {{SeMap}: {A} {Generic} {Mapping} {Construction} {System}},
booktitle = {Proceedings of the 11th {International} {Conference} on {Extending} {Database} {Technology} ({EDBT} '08)},
abstract = {Most previous schema mapping works focus on creating mappings in specific data models for data transformation, failing to capture a richer set of possible relationships between schema elements. For example, most schema matching approaches might discover that ‘TA’ in one schema equals ‘grad TA’ in another one, even though the relationship can be modeled more accurately by saying that ‘grad TA’ is a specialization of ‘TA’. Deepening the mapping semantics in turn allow richer application semantics. This paper presents and proves the effectiveness of SeMap, a system that constructs a complex, semantically richer mapping (including ‘Has-a’, ‘Is-a’, ‘Associates’ and ‘Equivalent’ relationship types) that can be used across data models. We achieve this goal by: (1) exploiting semantic evidence for possible matches; (2) finding a globally optimal match assignment; (3) identifying the relationship embedded in the selected matches. We implemented our semantic matching approach within a prototype system, SeMap, and showed its accuracy and effectiveness.},
language = {en},
author = {Wang, Ting and Pottinger, Rachel},
year = {2008},
pagetotal = {12},
file = {Wang and Pottinger - SeMap A Generic Mapping Construction System.pdf:/home/fmind/Documents/Zotero/storage/HI8AIKC3/Wang and Pottinger - SeMap A Generic Mapping Construction System.pdf:application/pdf}
}
@comment{Chaudhuri, Ganjam, Ganti and Motwani 2003 (ACM SIGMOD 2003): the export merged the first author into the second (Ganjam, Surajit Chaudhuri Kris). Authors are separated, the entry is typed as inproceedings with its booktitle and page range, and the month is set to the conference month.}
@inproceedings{ganjam_robust_2003,
title = {Robust and {Efficient} {Fuzzy} {Match} for {Online} {Data} {Cleaning}},
booktitle = {Proceedings of the 2003 {ACM} {SIGMOD} {International} {Conference} on {Management} of {Data}},
abstract = {To ensure high data quality, data warehouses must validate and cleanse incoming data tuples from external sources. In many situations, clean tuples must match acceptable tuples in reference tables. For example, product name and description fields in a sales record from a distributor must match the pre-recorded name and description fields in a product reference relation.},
language = {en},
publisher = {ACM},
author = {Chaudhuri, Surajit and Ganjam, Kris and Ganti, Venkatesh and Motwani, Rajeev},
month = jun,
year = {2003},
pages = {313--324},
file = {Ganjam et al. - Robust and Efficient Fuzzy Match for Online Data C.pdf:/home/fmind/Documents/Zotero/storage/4Q3JMF2Y/Ganjam et al. - Robust and Efficient Fuzzy Match for Online Data C.pdf:application/pdf}
}
@comment{Newcombe and Kennedy 1962: ISSN written in the standard hyphenated form, matching the other entries in this file.}
@article{newcombe_record_1962,
title = {Record linkage: making maximum use of the discriminating power of identifying information},
volume = {5},
issn = {0001-0782},
shorttitle = {Record linkage},
url = {http://portal.acm.org/citation.cfm?doid=368996.369026},
doi = {10.1145/368996.369026},
language = {en},
number = {11},
urldate = {2018-04-10},
journal = {Communications of the ACM},
author = {Newcombe, Howard B. and Kennedy, James M.},
month = nov,
year = {1962},
pages = {563--566},
file = {Newcombe and Kennedy - 1962 - Record linkage making maximum use of the discrimi.pdf:/home/fmind/Documents/Zotero/storage/XEVMCY84/Newcombe and Kennedy - 1962 - Record linkage making maximum use of the discrimi.pdf:application/pdf}
}
@comment{Raykar and Yu 2011 is a NIPS 2011 paper: typed as inproceedings with its booktitle, and the page count replaced by the proceedings page range.}
@inproceedings{raykar_ranking_2011,
title = {Ranking annotators for crowdsourced labeling tasks},
booktitle = {Advances in {Neural} {Information} {Processing} {Systems} 24},
abstract = {With the advent of crowdsourcing services it has become quite cheap and reasonably effective to get a dataset labeled by multiple annotators in a short amount of time. Various methods have been proposed to estimate the consensus labels by correcting for the bias of annotators with different kinds of expertise. Often we have low quality annotators or spammers–annotators who assign labels randomly (e.g., without actually looking at the instance). Spammers can make the cost of acquiring labels very expensive and can potentially degrade the quality of the consensus labels. In this paper we formalize the notion of a spammer and define a score which can be used to rank the annotators—with the spammers having a score close to zero and the good annotators having a high score close to one.},
language = {en},
author = {Raykar, Vikas C. and Yu, Shipeng},
year = {2011},
pages = {1809--1817},
file = {Raykar and Yu - Ranking annotators for crowdsourced labeling tasks.pdf:/home/fmind/Documents/Zotero/storage/SZPQ38JR/Raykar and Yu - Ranking annotators for crowdsourced labeling tasks.pdf:application/pdf}
}
@comment{Sheng, Provost and Ipeirotis 2008: added the booktitle that inproceedings requires (ACM SIGKDD 2008) and completed the page range, which previously gave only the first page.}
@inproceedings{sheng_get_2008,
title = {Get another label? improving data quality and data mining using multiple, noisy labelers},
booktitle = {Proceedings of the 14th {ACM} {SIGKDD} {International} {Conference} on {Knowledge} {Discovery} and {Data} {Mining} ({KDD} '08)},
isbn = {978-1-60558-193-4},
shorttitle = {Get another label?},
url = {http://dl.acm.org/citation.cfm?doid=1401890.1401965},
doi = {10.1145/1401890.1401965},
abstract = {This paper addresses the repeated acquisition of labels for data items when the labeling is imperfect. We examine the improvement (or lack thereof) in data quality via repeated labeling, and focus especially on the improvement of training labels for supervised induction. With the outsourcing of small tasks becoming easier, for example via Rent-A-Coder or Amazon’s Mechanical Turk, it often is possible to obtain less-than-expert labeling at low cost. With low-cost labeling, preparing the unlabeled part of the data can become considerably more expensive than labeling. We present repeated-labeling strategies of increasing complexity, and show several main results. (i) Repeated-labeling can improve label quality and model quality, but not always. (ii) When labels are noisy, repeated labeling can be preferable to single labeling even in the traditional setting where labels are not particularly cheap. (iii) As soon as the cost of processing the unlabeled data is not free, even the simple strategy of labeling everything multiple times can give considerable advantage. (iv) Repeatedly labeling a carefully chosen set of points is generally preferable, and we present a robust technique that combines different notions of uncertainty to select data points for which quality should be improved. The bottom line: the results show clearly that when labeling is not perfect, selective acquisition of multiple labels is a strategy that data miners should have in their repertoire; for certain label-quality/cost regimes, the benefit is substantial.},
language = {en},
urldate = {2018-04-10},
publisher = {ACM Press},
author = {Sheng, Victor S. and Provost, Foster and Ipeirotis, Panagiotis G.},
year = {2008},
pages = {614--622},
file = {Sheng et al. - 2008 - Get another label improving data quality and data.pdf:/home/fmind/Documents/Zotero/storage/YU5Z46CL/Sheng et al. - 2008 - Get another label improving data quality and data.pdf:application/pdf}
}
@comment{Madhavan, Bernstein and Rahm, Cupid, was published at VLDB 2001, not 2011. Year corrected; typed as inproceedings with its booktitle and page range. The citation key is kept unchanged for existing citations. NOTE(review): the attached 15-page PDF may be the longer technical-report version.}
@inproceedings{madhavan_generic_2011,
title = {Generic {Schema} {Matching} with {Cupid}},
booktitle = {Proceedings of the 27th {International} {Conference} on {Very} {Large} {Data} {Bases} ({VLDB} 2001)},
abstract = {Schema matching is a critical step in many applications, such as XML message mapping, data warehouse loading, and schema integration. In this paper, we investigate algorithms for generic schema matching, outside of any particular data model or application. We first present a taxonomy for past solutions, showing that a rich range of techniques is available. We then propose a new algorithm, Cupid, that discovers mappings between schema elements based on their names, data types, constraints, and schema structure, using a broader set of techniques than past approaches. Some of our innovations are the integrated use of linguistic and structural matching, context-dependent matching of shared types, and a bias toward leaf structure where much of the schema content resides. After describing our algorithm, we present experimental results that compare Cupid to two other schema matching systems.},
language = {en},
author = {Madhavan, Jayant and Bernstein, Philip A. and Rahm, Erhard},
year = {2001},
pages = {49--58},
file = {Madhavan et al. - Generic Schema Matching with Cupid.pdf:/home/fmind/Documents/Zotero/storage/FNAIPZRT/Madhavan et al. - Generic Schema Matching with Cupid.pdf:application/pdf}
}
@comment{Bernstein, Madhavan and Rahm 2011 appeared in Proceedings of the VLDB Endowment 4(11). Added the journal, volume and number that the article type needs, and replaced the page count with the published page range.}
@article{bernstein_generic_2011,
title = {Generic {Schema} {Matching}, {Ten} {Years} {Later}},
volume = {4},
abstract = {In a paper published in the 2001 VLDB Conference, we proposed treating generic schema matching as an independent problem. We developed a taxonomy of existing techniques, a new schema matching algorithm, and an approach to comparative evaluation. Since then, the field has grown into a major research topic. We briefly summarize the new techniques that have been developed and applications of the techniques in the commercial world. We conclude by discussing future trends and recommendations for further work.},
language = {en},
number = {11},
journal = {Proceedings of the VLDB Endowment},
author = {Bernstein, Philip A. and Madhavan, Jayant and Rahm, Erhard},
year = {2011},
pages = {695--701},
file = {Bernstein et al. - Generic Schema Matching, Ten Years Later.pdf:/home/fmind/Documents/Zotero/storage/AEYITLWA/Bernstein et al. - Generic Schema Matching, Ten Years Later.pdf:application/pdf}
}
@incollection{goos_discovering_1999,
  author    = {Pasquier, Nicolas and Bastide, Yves and Taouil, Rafik and Lakhal, Lotfi},
  title     = {Discovering {Frequent} {Closed} {Itemsets} for {Association} {Rules}},
  booktitle = {Database {Theory} — {ICDT}’99},
  editor    = {Beeri, Catriel and Buneman, Peter},
  series    = {Lecture Notes in Computer Science},
  volume    = {1540},
  pages     = {398--416},
  year      = {1999},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  isbn      = {978-3-540-65452-0},
  doi       = {10.1007/3-540-49257-7_25},
  url       = {http://link.springer.com/10.1007/3-540-49257-7_25},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {In this paper, we address the problem of finding frequent itemsets in a database. Using the closed itemset lattice framework, we show that this problem can be reduced to the problem of finding frequent closed itemsets. Based on this statement, we can construct efficient data mining algorithms by limiting the search space to the closed itemset lattice rather than the subset lattice. Moreover, we show that the set of all frequent closed itemsets suffices to determine a reduced set of association rules, thus addressing another important data mining problem: limiting the number of rules produced without information loss. We propose a new algorithm, called A-Close, using a closure mechanism to find frequent closed itemsets. We realized experiments to compare our approach to the commonly used frequent itemset search approach. Those experiments showed that our approach is very valuable for dense and/or correlated data that represent an important part of existing databases.},
  file      = {Pasquier et al. - 1999 - Discovering Frequent Closed Itemsets for Associati.pdf:/home/fmind/Documents/Zotero/storage/GTC3VGR2/Pasquier et al. - 1999 - Discovering Frequent Closed Itemsets for Associati.pdf:application/pdf}
}
@inproceedings{ramakrishnan_model_2005,
  author    = {Ramakrishnan, Ganesh and Chitrapura, Krishna Prasad and Krishnapuram, Raghu and Bhattacharyya, Pushpak},
  title     = {A model for handling approximate, noisy or incomplete labeling in text classification},
  booktitle = {Proceedings of the 22nd International Conference on Machine Learning ({ICML} '05)},
  pages     = {681--688},
  year      = {2005},
  publisher = {ACM Press},
  isbn      = {978-1-59593-180-1},
  doi       = {10.1145/1102351.1102437},
  url       = {http://portal.acm.org/citation.cfm?doid=1102351.1102437},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {We introduce a Bayesian model, BayesANIL, that is capable of estimating uncertainties associated with the labeling process. Given a labeled or partially labeled training corpus of text documents, the model estimates the joint distribution of training documents and class labels by using a generalization of the Expectation Maximization algorithm. The estimates can be used in standard classification models to reduce error rates. Since uncertainties in the labeling are taken into account, the model provides an elegant mechanism to deal with noisy labels. We provide an intuitive modification to the EM iterations by re-estimating the empirical distribution in order to reinforce feature values in unlabeled data and to reduce the influence of noisily labeled examples. Considerable improvement in the classification accuracies of two popular classification algorithms on standard labeled data-sets with and without artificially introduced noise, as well as in the presence and absence of unlabeled data, indicates that this may be a promising method to reduce the burden of manual labeling.},
  file      = {Ramakrishnan et al. - 2005 - A model for handling approximate, noisy or incompl.pdf:/home/fmind/Documents/Zotero/storage/PKJXVJQ5/Ramakrishnan et al. - 2005 - A model for handling approximate, noisy or incompl.pdf:application/pdf}
}
@techreport{serenko_agent_2002,
  author      = {Serenko, Alexander and Detlor, Brian},
  title       = {Agent Toolkits: A General Overview of the Market and an Assessment of Instructor Satisfaction with Utilizing Toolkits in the Classroom},
  type        = {Working Paper},
  number      = {455},
  institution = {Michael G. DeGroote School of Business, McMaster University},
  address     = {Hamilton, Ontario, Canada},
  month       = jul,
  year        = {2002},
  pagetotal   = {49},
  language    = {en},
  file        = {Serenko and Detlor - AGENT TOOLKITS A GENERAL OVERVIEW OF THE MARKET A.pdf:/home/fmind/Documents/Zotero/storage/SCUD56QP/Serenko and Detlor - AGENT TOOLKITS A GENERAL OVERVIEW OF THE MARKET A.pdf:application/pdf;Serenko and Detlor - AGENT TOOLKITS A GENERAL OVERVIEW OF THE MARKET A.pdf:/home/fmind/Documents/Zotero/storage/DZTW7TCV/Serenko and Detlor - AGENT TOOLKITS A GENERAL OVERVIEW OF THE MARKET A.pdf:application/pdf}
}
@inproceedings{kemp_problems_1993,
  author    = {Kemp, E. A.},
  title     = {Problems in expert systems development},
  booktitle = {Proceedings of the First New Zealand International Two-Stream Conference on Artificial Neural Networks and Expert Systems ({ANNES} '93)},
  pages     = {166--167},
  year      = {1993},
  publisher = {IEEE Computer Society Press},
  isbn      = {978-0-8186-4260-9},
  doi       = {10.1109/ANNES.1993.323053},
  url       = {http://ieeexplore.ieee.org/document/323053/},
  urldate   = {2018-04-10},
  language  = {en},
  file      = {Kemp - 1993 - Problems in expert systems development.pdf:/home/fmind/Documents/Zotero/storage/54MCTFH9/Kemp - 1993 - Problems in expert systems development.pdf:application/pdf}
}
@inproceedings{russell_learning_1998,
  author    = {Russell, Stuart},
  title     = {Learning agents for uncertain environments (extended abstract)},
  booktitle = {Proceedings of the Eleventh Annual Conference on Computational Learning Theory ({COLT} '98)},
  pages     = {101--103},
  year      = {1998},
  publisher = {ACM Press},
  isbn      = {978-1-58113-057-7},
  doi       = {10.1145/279943.279964},
  url       = {http://portal.acm.org/citation.cfm?doid=279943.279964},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {This talk proposes a very simple “baseline architecture” for a learning agent that can handle stochastic, partially observable environments. The architecture uses reinforcement learning together with a method for representing temporal processes as graphical models. I will discuss methods for learning the parameters and structure of such representations from sensory inputs, and for computing posterior probabilities. Some open problems remain before we can try out the complete agent; more arise when we consider scaling up.},
  file      = {Russell - 1998 - Learning agents for uncertain environments (extend.pdf:/home/fmind/Documents/Zotero/storage/I6AM7VUT/Russell - 1998 - Learning agents for uncertain environments (extend.pdf:application/pdf}
}
@article{mnih_human-level_2015,
  author   = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
  title    = {Human-level control through deep reinforcement learning},
  journal  = {Nature},
  volume   = {518},
  number   = {7540},
  pages    = {529--533},
  month    = feb,
  year     = {2015},
  issn     = {0028-0836, 1476-4687},
  doi      = {10.1038/nature14236},
  url      = {http://www.nature.com/articles/nature14236},
  urldate  = {2018-04-10},
  language = {en},
  file     = {Mnih et al. - 2015 - Human-level control through deep reinforcement lea.pdf:/home/fmind/Documents/Zotero/storage/HQ9AYK5F/Mnih et al. - 2015 - Human-level control through deep reinforcement lea.pdf:application/pdf}
}
@misc{laukien_feynman_2016,
  author        = {Laukien, Eric and Crowder, Richard and Byrne, Fergal},
  title         = {Feynman {Machine}: {The} {Universal} {Dynamical} {Systems} {Computer}},
  month         = sep,
  year          = {2016},
  eprint        = {1609.03971},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1609.03971},
  pagetotal     = {28},
  language      = {en},
  abstract      = {Efforts at understanding the computational processes in the brain have met with limited success, despite their importance and potential uses in building intelligent machines. We propose a simple new model which draws on recent findings in Neuroscience and the Applied Mathematics of interacting Dynamical Systems. The Feynman Machine is a Universal Computer for Dynamical Systems, analogous to the Turing Machine for symbolic computing, but with several important differences. We demonstrate that networks and hierarchies of simple interacting Dynamical Systems, each adaptively learning to forecast its evolution, are capable of automatically building sensorimotor models of the external and internal world. We identify such networks in mammalian neocortex, and show how existing theories of cortical computation combine with our model to explain the power and flexibility of mammalian intelligence. These findings lead directly to new architectures for machine intelligence. A suite of software implementations has been built based on these principles, and applied to a number of spatiotemporal learning tasks.},
  file          = {Laukien et al. - Feynman Machine The Universal Dynamical Systems C.pdf:/home/fmind/Documents/Zotero/storage/QKDM8WS6/Laukien et al. - Feynman Machine The Universal Dynamical Systems C.pdf:application/pdf}
}
@article{kravari_survey_2015,
  author   = {Kravari, Kalliopi and Bassiliades, Nick},
  title    = {A {Survey} of {Agent} {Platforms}},
  journal  = {Journal of Artificial Societies and Social Simulation},
  volume   = {18},
  number   = {1},
  year     = {2015},
  issn     = {1460-7425},
  doi      = {10.18564/jasss.2661},
  url      = {http://jasss.soc.surrey.ac.uk/18/1/11.html},
  urldate  = {2018-04-10},
  language = {en},
  abstract = {From computer games to human societies, many natural and artificial phenomena can be represented as multi-agent systems. Over time, these systems have been proven a really powerful tool for modelling and understanding phenomena in fields, such as economics and trading, health care, urban planning and social sciences. However, although, intelligent agents have been around for years, their actual implementation is still in its early stages. Since the late nineties many agent platforms have been developed. Some of them have already been abandoned whereas others continue releasing new versions. On the other hand, the agent-oriented research community is still providing more and more new platforms. This vast amount of platform options leads to a high degree of heterogeneity. Hence, a common problem is how people interested in using multi-agent systems should choose which platform to use in order to benefit from agent technology. This decision was usually left to word of mouth, past experiences or platform publicity, lately however people depend on solid survey articles. To date, in most cases multiagent system surveys describe only the basic characteristics of a few representatives without even providing any classification of the systems themselves. This article presents a comparative up-to-date review of the most promising existing agent platforms that can be used. It is based on universal comparison and evaluation criteria, proposing classifications for helping readers to understand which agent platforms broadly exhibit similar properties and in which situations which choices should be made.},
  file     = {Kravari and Bassiliades - 2015 - A Survey of Agent Platforms.pdf:/home/fmind/Documents/Zotero/storage/FIVUWFV5/Kravari and Bassiliades - 2015 - A Survey of Agent Platforms.pdf:application/pdf}
}
@inproceedings{abbeel_apprenticeship_2004,
  author    = {Abbeel, Pieter and Ng, Andrew Y.},
  title     = {Apprenticeship learning via inverse reinforcement learning},
  booktitle = {Proceedings of the Twenty-First International Conference on Machine Learning ({ICML} '04)},
  pages     = {1},
  year      = {2004},
  publisher = {ACM Press},
  doi       = {10.1145/1015330.1015430},
  url       = {http://portal.acm.org/citation.cfm?doid=1015330.1015430},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {We consider learning in a Markov decision process where we are not explicitly given a reward function, but where instead we can observe an expert demonstrating the task that we want to learn to perform. This setting is useful in applications (such as the task of driving) where it may be difficult to write down an explicit reward function specifying exactly how different desiderata should be traded off. We think of the expert as trying to maximize a reward function that is expressible as a linear combination of known features, and give an algorithm for learning the task demonstrated by the expert. Our algorithm is based on using “inverse reinforcement learning” to try to recover the unknown reward function. We show that our algorithm terminates in a small number of iterations, and that even though we may never recover the expert’s reward function, the policy output by the algorithm will attain performance close to that of the expert, where here performance is measured with respect to the expert’s unknown reward function.},
  file      = {Abbeel and Ng - 2004 - Apprenticeship learning via inverse reinforcement .pdf:/home/fmind/Documents/Zotero/storage/TCMT4493/Abbeel and Ng - 2004 - Apprenticeship learning via inverse reinforcement .pdf:application/pdf}
}
@inproceedings{chen_xgboost:_2016,
  author     = {Chen, Tianqi and Guestrin, Carlos},
  title      = {{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}},
  shorttitle = {{XGBoost}},
  booktitle  = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining ({KDD} '16)},
  pages      = {785--794},
  year       = {2016},
  publisher  = {ACM Press},
  isbn       = {978-1-4503-4232-2},
  doi        = {10.1145/2939672.2939785},
  url        = {http://dl.acm.org/citation.cfm?doid=2939672.2939785},
  urldate    = {2018-04-10},
  language   = {en},
  abstract   = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
  file       = {Chen and Guestrin - 2016 - XGBoost A Scalable Tree Boosting System.pdf:/home/fmind/Documents/Zotero/storage/NA3J5I3L/Chen and Guestrin - 2016 - XGBoost A Scalable Tree Boosting System.pdf:application/pdf}
}
@techreport{mitchell_need_1980,
  author      = {Mitchell, Tom M.},
  title       = {The {Need} for {Biases} in {Learning} {Generalizations}},
  type        = {Technical Report},
  number      = {CBM-TR-117},
  institution = {Department of Computer Science, Rutgers University},
  address     = {New Brunswick, NJ},
  year        = {1980},
  pagetotal   = {3},
  language    = {en},
  file        = {Mitchell - The Need for Biases in Learning Generalizations.pdf:/home/fmind/Documents/Zotero/storage/YEJHM622/Mitchell - The Need for Biases in Learning Generalizations.pdf:application/pdf}
}
@misc{lipton_mythos_2016,
  author        = {Lipton, Zachary C.},
  title         = {The {Mythos} of {Model} {Interpretability}},
  month         = jun,
  year          = {2016},
  eprint        = {1606.03490},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/1606.03490},
  pagetotal     = {5},
  language      = {en},
  abstract      = {Supervised machine learning models boast remarkable predictive capabilities. But can you trust your model? Will it work in deployment? What else can it tell you about the world? We want models to be not only good, but interpretable. And yet the task of interpretation appears underspecified. Papers provide diverse and sometimes non-overlapping motivations for interpretability, and offer myriad notions of what attributes render models interpretable. Despite this ambiguity, many papers proclaim interpretability axiomatically, absent further explanation. In this paper, we seek to refine the discourse on interpretability. First, we examine the motivations underlying interest in interpretability, finding them to be diverse and occasionally discordant. Then, we address model properties and techniques thought to confer interpretability, identifying transparency to humans and post-hoc explanations as competing notions. Throughout, we discuss the feasibility and desirability of different notions, and question the oft-made assertions that linear models are interpretable and that deep neural networks are not.},
  file          = {Lipton - The Mythos of Model Interpretability.pdf:/home/fmind/Documents/Zotero/storage/JN9VXRRP/Lipton - The Mythos of Model Interpretability.pdf:application/pdf}
}
@article{xu_survey_2005,
  author   = {Xu, Rui and Wunsch, II, Donald},
  title    = {Survey of {Clustering} {Algorithms}},
  journal  = {IEEE Transactions on Neural Networks},
  volume   = {16},
  number   = {3},
  pages    = {645--678},
  month    = may,
  year     = {2005},
  issn     = {1045-9227},
  doi      = {10.1109/TNN.2005.845141},
  url      = {http://ieeexplore.ieee.org/document/1427769/},
  urldate  = {2018-04-10},
  language = {en},
  abstract = {Data analysis plays an indispensable role for understanding various phenomena. Cluster analysis, primitive exploration with little or no prior knowledge, consists of research developed across a wide variety of communities. The diversity, on one hand, equips us with many tools. On the other hand, the profusion of options causes confusion. We survey clustering algorithms for data sets appearing in statistics, computer science, and machine learning, and illustrate their applications in some benchmark data sets, the traveling salesman problem, and bioinformatics, a new field attracting intensive efforts. Several tightly related topics, proximity measure, and cluster validation, are also discussed.},
  file     = {Xu and WunschII - 2005 - Survey of Clustering Algorithms.pdf:/home/fmind/Documents/Zotero/storage/TPHYWA6N/Xu and WunschII - 2005 - Survey of Clustering Algorithms.pdf:application/pdf}
}
@article{kotsiantis_supervised_2007,
  author   = {Kotsiantis, Sotiris B.},
  title    = {Supervised {Machine} {Learning}: {A} {Review} of {Classification} {Techniques}},
  journal  = {Informatica},
  volume   = {31},
  pages    = {249--268},
  month    = jul,
  year     = {2007},
  language = {en},
  file     = {Kotsiantis - Supervised Machine Learning A Review of Classific.pdf:/home/fmind/Documents/Zotero/storage/H7MF7E7F/Kotsiantis - Supervised Machine Learning A Review of Classific.pdf:application/pdf}
}
@inproceedings{lin_power_2010,
  author    = {Lin, Frank and Cohen, William W.},
  title     = {Power {Iteration} {Clustering}},
  booktitle = {Proceedings of the 27th International Conference on Machine Learning ({ICML}-10)},
  pages     = {655--662},
  year      = {2010},
  language  = {en},
  abstract  = {We present a simple and scalable graph clustering method called power iteration clustering (PIC). PIC finds a very low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise similarity matrix of the data. This embedding turns out to be an effective cluster indicator, consistently outperforming widely used spectral methods such as NCut on real datasets. PIC is very fast on large datasets, running over 1,000 times faster than an NCut implementation based on the state-of-the-art IRAM eigenvector computation technique.},
  file      = {Lin and Cohen - Power Iteration Clustering.pdf:/home/fmind/Documents/Zotero/storage/T2WD7PV6/Lin and Cohen - Power Iteration Clustering.pdf:application/pdf}
}
@inproceedings{farber_using_2010,
  author    = {Färber, Ines and Günnemann, Stephan and Kriegel, Hans-Peter and Kröger, Peer and Müller, Emmanuel and Schubert, Erich and Seidl, Thomas and Zimek, Arthur},
  title     = {On {Using} {Class}-{Labels} in {Evaluation} of {Clusterings}},
  booktitle = {Proceedings of the 1st International Workshop on Discovering, Summarizing and Using Multiple Clusterings ({MultiClust} 2010), held in conjunction with {KDD} 2010},
  year      = {2010},
  pagetotal = {9},
  language  = {en},
  abstract  = {Although clustering has been studied for several decades, the fundamental problem of a valid evaluation has not yet been solved. The sound evaluation of clustering results in particular on real data is inherently difficult. In the literature, new clustering algorithms and their results are often externally evaluated with respect to an existing class labeling. These class-labels, however, may not be adequate for the structure of the data or the evaluated cluster model. Here, we survey the literature of different related research areas that have observed this problem. We discuss common “defects” that clustering algorithms exhibit w.r.t. this evaluation, and show them on several real world data sets of different domains along with a discussion why the detected clusters do not indicate a bad performance of the algorithm but are valid and useful results. An useful alternative evaluation method requires more extensive data labeling than the commonly used class labels or it needs a combination of information measures to take subgroups, supergroups, and overlapping sets of traditional classes into account. Finally, we discuss an evaluation scenario that regards the possible existence of several complementary sets of labels and hope to stimulate the discussion among different sub-communities — like ensemble-clustering, subspace-clustering, multi-label classification, hierarchical classification or hierarchical clustering, and multiview-clustering or alternative clustering —regarding requirements on enhanced evaluation methods.},
  file      = {Färber et al. - On Using Class-Labels in Evaluation of Clusterings.pdf:/home/fmind/Documents/Zotero/storage/ZM7M9JYR/Färber et al. - On Using Class-Labels in Evaluation of Clusterings.pdf:application/pdf}
}
@article{halkidi_clustering_2001,
  author   = {Halkidi, Maria and Batistakis, Yannis and Vazirgiannis, Michalis},
  title    = {On {Clustering} {Validation} {Techniques}},
  journal  = {Journal of Intelligent Information Systems},
  volume   = {17},
  number   = {2--3},
  pages    = {107--145},
  year     = {2001},
  doi      = {10.1023/A:1012801612483},
  language = {en},
  abstract = {Cluster analysis aims at identifying groups of similar objects and, therefore helps to discover distribution of patterns and interesting correlations in large data sets. It has been subject of wide research since it arises in many application domains in engineering, business and social sciences. Especially, in the last years the availability of huge transactional and experimental data sets and the arising requirements for data mining created needs for clustering algorithms that scale and can be applied in diverse domains.},
  file     = {HALKIDI - On Clustering Validation Techniques.pdf:/home/fmind/Documents/Zotero/storage/J5HN5X6X/HALKIDI - On Clustering Validation Techniques.pdf:application/pdf}
}
@inproceedings{goldberg_measuring_2010,
  author    = {Goldberg, Mark K. and Hayvanovych, Mykola and Magdon-Ismail, Malik},
  title     = {Measuring {Similarity} between {Sets} of {Overlapping} {Clusters}},
  booktitle = {2010 {IEEE} Second International Conference on Social Computing ({SocialCom})},
  pages     = {303--308},
  month     = aug,
  year      = {2010},
  publisher = {IEEE},
  isbn      = {978-1-4244-8439-3},
  doi       = {10.1109/SocialCom.2010.50},
  url       = {http://ieeexplore.ieee.org/document/5591225/},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {The typical task of unsupervised learning is to organize data, for example into clusters, typically disjoint clusters (eg. the K-means algorithm). One would expect (for example) a clustering of books into topics to present overlapping clusters. The situation is even more so in social networks, a source of ever increasing data. Finding the groups or communities in social networks based on interactions between individuals (a measure of similarity) is an unsupervised learning task; and, groups overlap – an individual can be a chess player and a violin player, in which case he would interact with members of both these groups.},
  file      = {Goldberg et al. - 2010 - Measuring Similarity between Sets of Overlapping C.pdf:/home/fmind/Documents/Zotero/storage/JZ9KZMX9/Goldberg et al. - 2010 - Measuring Similarity between Sets of Overlapping C.pdf:application/pdf}
}
@misc{kolter_linear_2008,
  author       = {Kolter, Zico and Do, Chuong},
  title        = {Linear {Algebra} {Review} and {Reference}},
  howpublished = {Course notes, {CS229} Machine Learning, Stanford University},
  month        = oct,
  year         = {2008},
  pagetotal    = {26},
  language     = {en},
  file         = {Kolter and Do - Linear Algebra Review and Reference.pdf:/home/fmind/Documents/Zotero/storage/G4T7IL7U/Kolter and Do - Linear Algebra Review and Reference.pdf:application/pdf}
}
@comment{blei_latent_2016: the citation key is kept so existing citations resolve; the work itself is the 2003 JMLR article, not a 2016 publication.}
@article{blei_latent_2016,
  author   = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  title    = {Latent {Dirichlet} {Allocation}},
  journal  = {Journal of Machine Learning Research},
  volume   = {3},
  pages    = {993--1022},
  year     = {2003},
  language = {en},
  abstract = {We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of text modeling, the topic probabilities provide an explicit representation of a document. We present efficient approximate inference techniques based on variational methods and an EM algorithm for empirical Bayes parameter estimation. We report results in document modeling, text classification, and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI model.},
  file     = {Blei - Latent Dirichlet Allocation.pdf:/home/fmind/Documents/Zotero/storage/QWPQJ4YM/Blei - Latent Dirichlet Allocation.pdf:application/pdf}
}
@article{rendon_internal_2011,
  author   = {Rendón, Eréndira and Abundez, Itzel and Arizmendi, Alejandra and Quiroz, Elvia M.},
  title    = {Internal versus {External} cluster validation indexes},
  journal  = {International Journal of Computers and Communications},
  volume   = {5},
  number   = {1},
  pages    = {27--34},
  year     = {2011},
  language = {en},
  abstract = {One of fundamental challenges of clustering is how to evaluate results, without auxiliary information. A common approach for evaluation of clustering results is to use validity indexes. Clustering validity approaches can use three criteria: External criteria (evaluate the result with respect to a pre-specified structure), internal criteria (evaluate the result with respect a information intrinsic to the data alone). Consequently, different types of indexes are used to solve different types of problems and indexes selection depends on the kind of available information. That is why in this paper we show a comparison between external and internal indexes. Results obtained in this study indicate that internal indexes are more accurate in group determining in a given clustering structure. Six internal indexes were used in this study: BIC, CH, DB, SIL, NIVA and DUNN and four external indexes (F-measure, NMIMeasure, Entropy, Purity). The clusters that were used were obtained through clustering algorithms K-means and Bissecting-Kmeans.},
  file     = {Rendón et al. - 2011 - Internal versus External cluster validation indexe.pdf:/home/fmind/Documents/Zotero/storage/RYTLXQKT/Rendón et al. - 2011 - Internal versus External cluster validation indexe.pdf:application/pdf}
}
@article{lorenz_how_2011,
  author   = {Lorenz, J. and Rauhut, H. and Schweitzer, F. and Helbing, D.},
  title    = {How social influence can undermine the wisdom of crowd effect},
  journal  = {Proceedings of the National Academy of Sciences},
  volume   = {108},
  number   = {22},
  pages    = {9020--9025},
  month    = may,
  year     = {2011},
  issn     = {0027-8424, 1091-6490},
  doi      = {10.1073/pnas.1008636108},
  url      = {http://www.pnas.org/cgi/doi/10.1073/pnas.1008636108},
  urldate  = {2018-04-10},
  language = {en},
  file     = {Lorenz et al. - 2011 - How social influence can undermine the wisdom of c.pdf:/home/fmind/Documents/Zotero/storage/5GPX7WQ8/Lorenz et al. - 2011 - How social influence can undermine the wisdom of c.pdf:application/pdf}
}
@inproceedings{kanter_deep_2015,
  author     = {Kanter, James Max and Veeramachaneni, Kalyan},
  title      = {Deep feature synthesis: {Towards} automating data science endeavors},
  shorttitle = {Deep feature synthesis},
  booktitle  = {2015 {IEEE} International Conference on Data Science and Advanced Analytics ({DSAA})},
  pages      = {1--10},
  month      = oct,
  year       = {2015},
  publisher  = {IEEE},
  isbn       = {978-1-4673-8272-4},
  doi        = {10.1109/DSAA.2015.7344858},
  url        = {http://ieeexplore.ieee.org/document/7344858/},
  urldate    = {2018-04-10},
  language   = {en},
  abstract   = {In this paper, we develop the Data Science Machine, which is able to derive predictive models from raw data automatically. To achieve this automation, we first propose and develop the Deep Feature Synthesis algorithm for automatically generating features for relational datasets. The algorithm follows relationships in the data to a base field, and then sequentially applies mathematical functions along that path to create the final feature. Second, we implement a generalizable machine learning pipeline and tune it using a novel Gaussian Copula process based approach. We entered the Data Science Machine in 3 data science competitions that featured 906 other data science teams. Our approach beats 615 teams in these data science competitions. In 2 of the 3 competitions we beat a majority of competitors, and in the third, we achieved 94\% of the best competitor’s score. In the best case, with an ongoing competition, we beat 85.6\% of the teams and achieved 95.7\% of the top submissions score.},
  file       = {Kanter and Veeramachaneni - 2015 - Deep feature synthesis Towards automating data sc.pdf:/home/fmind/Documents/Zotero/storage/5AZFYQ85/Kanter and Veeramachaneni - 2015 - Deep feature synthesis Towards automating data sc.pdf:application/pdf}
}
@article{liao_data_2012,
  author   = {Liao, Shu-Hsien and Chu, Pei-Hui and Hsiao, Pei-Yuan},
  title    = {Data mining techniques and applications – {A} decade review from 2000 to 2011},
  journal  = {Expert Systems with Applications},
  volume   = {39},
  number   = {12},
  pages    = {11303--11311},
  month    = sep,
  year     = {2012},
  issn     = {09574174},
  doi      = {10.1016/j.eswa.2012.02.063},
  url      = {http://linkinghub.elsevier.com/retrieve/pii/S0957417412003077},
  urldate  = {2018-04-10},
  language = {en},
  abstract = {In order to determine how data mining techniques (DMT) and their applications have developed, during the past decade, this paper reviews data mining techniques and their applications and development, through a survey of literature and the classification of articles, from 2000 to 2011. Keyword indices and article abstracts were used to identify 216 articles concerning DMT applications, from 159 academic journals (retrieved from five online databases), this paper surveys and classifies DMT, with respect to the following three areas: knowledge types, analysis types, and architecture types, together with their applications in different research and practical domains. A discussion deals with the direction of any future developments in DMT methodologies and applications: (1) DMT is finding increasing applications in expertise orientation and the development of applications for DMT is a problem-oriented domain. (2) It is suggested that different social science methodologies, such as psychology, cognitive science and human behavior might implement DMT, as an alternative to the methodologies already on offer. (3) The ability to continually change and acquire new understanding is a driving force for the application of DMT and this will allow many new future applications.},
  file     = {Liao et al. - 2012 - Data mining techniques and applications – A decade.pdf:/home/fmind/Documents/Zotero/storage/5E4T8VJ7/Liao et al. - 2012 - Data mining techniques and applications – A decade.pdf:application/pdf}
}
@article{jain_data_1999,
  author     = {Jain, A. K. and Murty, M. N. and Flynn, P. J.},
  title      = {Data clustering: a review},
  shorttitle = {Data clustering},
  journal    = {ACM Computing Surveys},
  volume     = {31},
  number     = {3},
  pages      = {264--323},
  month      = sep,
  year       = {1999},
  issn       = {03600300},
  doi        = {10.1145/331499.331504},
  url        = {http://portal.acm.org/citation.cfm?doid=331499.331504},
  urldate    = {2018-04-10},
  language   = {en},
  file       = {Jain et al. - 1999 - Data clustering a review.pdf:/home/fmind/Documents/Zotero/storage/C9DQVQIB/Jain et al. - 1999 - Data clustering a review.pdf:application/pdf}
}
@article{meila_comparing_2007,
  author   = {Meilă, Marina},
  title    = {Comparing clusterings—an information based distance},
  journal  = {Journal of Multivariate Analysis},
  volume   = {98},
  number   = {5},
  pages    = {873--895},
  month    = may,
  year     = {2007},
  issn     = {0047-259X},
  doi      = {10.1016/j.jmva.2006.11.013},
  url      = {http://linkinghub.elsevier.com/retrieve/pii/S0047259X06002016},
  urldate  = {2018-04-10},
  language = {en},
  abstract = {This paper proposes an information theoretic criterion for comparing two partitions, or clusterings, of the same data set. The criterion, called variation of information (VI), measures the amount of information lost and gained in changing from clustering C to clustering C'. The basic properties of VI are presented and discussed. We focus on two kinds of properties: (1) those that help one build intuition about the new criterion (in particular, it is shown the VI is a true metric on the space of clusterings), and (2) those that pertain to the comparability of VI values over different experimental conditions. As the latter properties have rarely been discussed explicitly before, other existing comparison criteria are also examined in their light. Finally we present the VI from an axiomatic point of view, showing that it is the only “sensible” criterion for comparing partitions that is both aligned to the lattice and convexely additive. As a consequence, we prove an impossibility result for comparing partitions: there is no criterion for comparing partitions that simultaneously satisfies the above two desirable properties and is bounded.},
  file     = {Meilă - 2007 - Comparing clusterings—an information based distanc.pdf:/home/fmind/Documents/Zotero/storage/TQZMDAJA/Meilă - 2007 - Comparing clusterings—an information based distanc.pdf:application/pdf}
}
% Long, Zhang and Yu (2005): soft-correspondence framework for combining
% multiple clusterings into a consensus clustering.
% Author names are stored in "Last, First" form so BibTeX can split, sort and
% abbreviate them; the citation key is kept as-is so existing \cite commands work.
% NOTE(review): booktitle was added from the DOI/ISBN record (ICDM 2005) -- confirm
% the exact proceedings title against the publisher page.
@inproceedings{bo_long_combining_2005,
  author    = {Long, Bo and Zhang, Zhongfei and Yu, Philip S.},
  title     = {Combining {Multiple} {Clusterings} by {Soft} {Correspondence}},
  booktitle = {Fifth {IEEE} {International} {Conference} on {Data} {Mining} ({ICDM}'05)},
  publisher = {IEEE},
  year      = {2005},
  pages     = {282--289},
  isbn      = {978-0-7695-2278-4},
  doi       = {10.1109/ICDM.2005.45},
  url       = {http://ieeexplore.ieee.org/document/1565690/},
  urldate   = {2018-04-10},
  language  = {en},
  abstract  = {Combining multiple clusterings arises in various important data mining scenarios. However, finding a consensus clustering from multiple clusterings is a challenging task because there is no explicit correspondence between the classes from different clusterings. We present a new framework based on soft correspondence to directly address the correspondence problem in combining multiple clusterings. Under this framework, we propose a novel algorithm that iteratively computes the consensus clustering and correspondence matrices using multiplicative updating rules. This algorithm provides a final consensus clustering as well as correspondence matrices that gives intuitive interpretation of the relations between the consensus clustering and each clustering from clustering ensembles. Extensive experimental evaluations also demonstrate the effectiveness and potential of this framework as well as the algorithm for discovering a consensus clustering from multiple clusterings.},
  file      = {Bo Long et al. - 2005 - Combining Multiple Clusterings by Soft Corresponde.pdf:/home/fmind/Documents/Zotero/storage/TCWA398G/Bo Long et al. - 2005 - Combining Multiple Clusterings by Soft Corresponde.pdf:application/pdf}
}
@article{fowlkes_method_1983,
title = {A {Method} for {Comparing} {Two} {Hierarchical} {Clusterings}},
volume = {78},
issn = {01621459},
url = {https://www.jstor.org/stable/2288117?origin=crossref},
doi = {10.2307/2288117},
language = {en},
number = {383},
urldate = {2018-04-10},
journal = {Journal of the American Statistical Association},
author = {Fowlkes, E. B. and Mallows, C. L.},
month = sep,
year = {1983},
pages = {553},
file = {Fowlkes and Mallows - 1983 - A Method for Comparing Two Hierarchical Clustering.pdf:/home/fmind/Documents/Zotero/storage/ZYXRC3A2/Fowlkes and Mallows - 1983 - A Method for Comparing Two Hierarchical Clustering.pdf:application/pdf}