# pyes2csv.py
# Forked from jeffsteinmetz/pyes2csv
# Sample script by Jeff Steinmetz, Twitter: @jeffsteinmetz
# MIT License
# V1.0, created 2014-01-17
#
# Requires the Python Elasticsearch client
# http://www.elasticsearch.org/blog/unleash-the-clients-ruby-python-php-perl/#python
#
import elasticsearch
import csv
import random
import unicodedata
# Replace with the host:port of your Elasticsearch node.
es = elasticsearch.Elasticsearch(["10.1.1.1:9200"])

# Replace the following query with your own Elasticsearch query.
res = es.search(
    index="YourIndexName",
    body={
        "query": {
            "bool": {
                "must": [
                    {"match": {"search_field": "search string"}},
                    {"range": {"created_at": {"gte": "2013-10-01", "lte": "2014-01-29"}}},
                    {"match": {"another_field": "foo"}},
                ]
            }
        }
    },
    # Maximum number of rows to return from the query. To export every match,
    # run the script once, note the total number of hits it reports, then set
    # size equal to a number >= that total.
    size=500,
)
# Fixed seed so the optional random sample below is the same on every run.
random.seed(1)
sample = res['hits']['hits']
# Comment out the previous line and un-comment the next one for a random sample instead.
#randomsample = random.sample(res['hits']['hits'], 5)  # change 5 to the number of rows to sample at random from your query

# hits.total is a plain integer on older Elasticsearch versions, but an object
# such as {"value": 123, "relation": "eq"} on 7.x and later; accept both so
# the %d format below does not fail on newer servers.
total_hits = res['hits']['total']
if isinstance(total_hits, dict):
    total_hits = total_hits['value']
print("Got %d Hits:" % total_hits)
# The csv module wants a binary file and byte-string cells on Python 2, but a
# text file opened with newline='' and text cells on Python 3.
_PY2 = str is bytes  # only true on Python 2, where str is the byte-string type
_TEXT = type(u"")  # unicode on Python 2, str on Python 3


def _cell(hit, *keys):
    """Return hit[keys[0]][keys[1]]... as a single-line cell for the csv writer.

    The data is unstructured, so a field may not exist for a given hit: a
    missing key (or a value along the path that cannot be indexed) yields ""
    instead of raising. Newlines are replaced with spaces so that each hit
    stays on one output row. Non-string values are converted to text.
    """
    value = hit
    try:
        for key in keys:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return ""
    if value is None:
        return ""
    if isinstance(value, bytes):
        # Only byte strings need decoding; text must not be decoded again.
        value = value.decode('utf-8', 'replace')
    elif not isinstance(value, _TEXT):
        value = _TEXT(value)
    value = value.replace('\n', ' ')
    # Python 2's csv writer cannot take non-ASCII unicode; give it UTF-8 bytes.
    return value.encode('utf-8') if _PY2 else value


# Set the name of the output file here.
if _PY2:
    csvfile = open('outputfile.tsv', 'wb')
else:
    csvfile = open('outputfile.tsv', 'w', newline='', encoding='utf-8')

with csvfile:
    # TAB delimited, to handle cases where free-form text may contain a comma.
    filewriter = csv.writer(csvfile, delimiter='\t',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # Header row: change the column labels here.
    filewriter.writerow(["id", "column2", "column3"])
    # Switch sample to randomsample if you want a random subset instead of all rows.
    for hit in sample:
        filewriter.writerow([
            _cell(hit, "_id"),
            # Replace these nested key names with your own.
            _cell(hit, "some", "deeply", "nested", "field"),
            _cell(hit, "someother", "deeply", "nested", "field"),
        ])