# CINERGI-SiteMap.py
# coding: utf-8
#
from datetime import datetime
import requests
import sys
# see http://docs.python-requests.org/en/master/user/quickstart/ for package documentation
geoportalBaseURL = 'http://datadiscoverystudio.org/geoportal/'
catalogISOmetadataBase = geoportalBaseURL + 'rest/metadata/item/'
print(catalogISOmetadataBase)
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
fileLocationBase = 'c:\\tmp\\'
print(fileLocationBase)
sitemaptohtml = 'https://raw.githubusercontent.com/CINERGI/xmlsitemap/master/xml-sitemap.xsl'
# suggest copying the xslt file into the same directory as the sitemaps, in which case use this
# value for sitemaptohtml:
# sitemaptohtml = 'xml-sitemap.xsl'

# first, some utility functions for file generation and writing
def writeLinks(response, mfile):
    # writes entries in a sitemap file, with the URL for the metadata record as html; the record
    # is expected to include a schema.org JSON-LD script for use by the search indexers
    for hit in response["hits"]["hits"]:
        # get the id outside the try block so the error message below can always report it
        hitid = hit.get("_id", "unknown")
        # hittitle = hit["_source"]["title"]
        try:
            hitmodified = hit["_source"]["sys_modified_dt"]
            # print("title: ", hittitle, " id: ", hitid, " date: ", hitmodified)
            mfile.write('<url>')
            mfile.write("\n")
            # original CINERGI catalog location:
            # mfile.write('<loc>http://cinergi.sdsc.edu/geoportal/rest/metadata/item/'
            #             + hitid + '/html</loc>')
            mfile.write('<loc>' + catalogISOmetadataBase + hitid + '/html</loc>')
            mfile.write("\n")
            mfile.write('<lastmod>' + hitmodified + '</lastmod>')
            mfile.write("\n")
            mfile.write('<changefreq>monthly</changefreq>')
            mfile.write("\n")
            # mfile.write('<priority>0.8</priority>')
            # mfile.write("\n")
            mfile.write('</url>')
            mfile.write("\n")
        except Exception:
            print("ERROR writing sitemap url for _id= " + hitid)
            print(sys.exc_info()[1])
    return
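
# For reference, a minimal sketch of the Elasticsearch response shape writeLinks expects,
# and the <url> entry it would emit (the id and date values are illustrative, not from a
# live catalog):
#
#   {"hits": {"hits": [{"_id": "abc123",
#                       "_source": {"sys_modified_dt": "2018-01-01T00:00:00Z"}}]}}
#
# produces:
#
#   <url>
#   <loc>http://datadiscoverystudio.org/geoportal/rest/metadata/item/abc123/html</loc>
#   <lastmod>2018-01-01T00:00:00Z</lastmod>
#   <changefreq>monthly</changefreq>
#   </url>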
def indexFile():
    # sets up the sitemap index. This file has a link to each sitemap file.
    # each sitemap written here holds one batch of up to 10000 entries (the sitemap
    # protocol itself allows up to 50,000 per file), so a bigger catalog needs multiple
    # sitemaps, pointed to from this index.
    try:
        file_object = open(fileLocationBase + "DDSSiteIndex.xml", "w")
    except Exception:
        print("ERROR: Can't open the index file, bailing out")
        print(sys.exc_info()[1])
        sys.exit(1)
    # put in the header stuff
    file_object.write(XML_HEADER)
    file_object.write("\n")
    file_object.write('<?xml-stylesheet type="text/xsl" href="' + sitemaptohtml + '"?>')
    file_object.write('\n')
    file_object.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    file_object.write("\n")
    return file_object
def siteMapFile(name):
    # opens a new, empty sitemap file and returns the file_object for writing to it.
    try:
        file_object = open(fileLocationBase + name, "w")
    except Exception:
        print("ERROR: Can't open the new sitemap file: " + name + ", bailing out")
        print(sys.exc_info()[1])
        sys.exit(1)
    # put in the header stuff
    file_object.write(XML_HEADER)
    file_object.write('\n')
    file_object.write('<?xml-stylesheet type="text/xsl" href="' + sitemaptohtml + '"?>')
    file_object.write('\n')
    file_object.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    file_object.write('\n')
    return file_object
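
# Both helpers write the same three-line preamble before their root element, so a freshly
# opened sitemap file starts like this (the xml-stylesheet reference is what lets a browser
# render the sitemap as HTML):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <?xml-stylesheet type="text/xsl" href=".../xml-sitemap.xsl"?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">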
# construct the Elasticsearch URL with the search request
# espath = "http://cinergi.sdsc.edu/geoportal/elastic/"
espath = geoportalBaseURL + "elastic/"
esindex = "metadata"
esresource = "/item/_search"
baseURL = espath + esindex + esresource
# need to use scrolling because there are >10000 records
# this is the time-to-live for the scroll index; renewed on each search call
p_scroll = "1m"
# number of records to return in each batch.
# This will be the number of links in each sitemap file
p_size = "10000"
# use this for testing:
# p_size = "10"
# the only field we need for the sitemap is the modified date.
# comma-delimited list of index fields to return from the _source section of the hits object
# p_source = "sys_modified_dt,title"
p_source = "sys_modified_dt"
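
# With these settings, the initial search request below resolves to a URL of this form
# (illustrative, assembled from the values above):
#
#   http://datadiscoverystudio.org/geoportal/elastic/metadata/item/_search?scroll=1m&size=10000&_source=sys_modified_dt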
# first, get the scroll id to start the scrolling loop, and the total number of records
counter = 0
filecount = 0
# print(counter)

# first request, to get scrolling set up
p = {'scroll': p_scroll,
     'size': p_size,
     '_source': p_source}
r = requests.get(baseURL, params=p)
print("request1: ", r.url)
if r.status_code == requests.codes.ok:
    response = r.json()
    totalRecords = response["hits"]["total"]
    scrollID = response["_scroll_id"]
    # set up the index file
    indexhandle = indexFile()
    print("total records: ", totalRecords)
    sitemapfilename = "ddssitemap" + str(filecount) + ".xml"
    sitemaphandle = siteMapFile(sitemapfilename)
    writeLinks(response, sitemaphandle)
    sitemaphandle.write('</urlset>')
    sitemaphandle.close()
    # new index entry
    indexhandle.write('<sitemap>')
    indexhandle.write('\n')
    # indexhandle.write('<loc>http://cinergi.sdsc.edu/geoportal/' + sitemapfilename + '</loc>')
    # providing a full URL puts links in the sitemap index:
    # indexhandle.write('<loc>' + geoportalBaseURL + sitemapfilename + '</loc>')
    # using local file paths also works, and is likely easier to maintain in the long run:
    indexhandle.write('<loc>' + sitemapfilename + '</loc>')
    indexhandle.write('\n')
    indexhandle.write('<lastmod>' + str(datetime.now()) + '</lastmod>')
    indexhandle.write('\n')
    indexhandle.write('</sitemap>')
    indexhandle.write('\n')
    filecount = filecount + 1
    counter = counter + int(p_size)
else:
    r.raise_for_status()
    sys.exit(1)
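
# Each pass through the loop below pulls the next batch from the scroll cursor, writes one
# sitemap file, and appends an index entry. With the local-path style used above, an entry
# looks like this (the lastmod value is the generation time, not a record date):
#
#   <sitemap>
#   <loc>ddssitemap0.xml</loc>
#   <lastmod>2018-06-01 12:00:00.000000</lastmod>
#   </sitemap>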
while counter < totalRecords:
    # use this for testing:
    # while counter < 50:
    # have to hit the scroll resource for Elasticsearch
    esresource = "_search/scroll"
    # the Geoportal Elasticsearch pass-through requires the publisher role to use the
    # scroll resource, so credentials are embedded in the URL; PASSWORD is a placeholder,
    # substitute a valid one before running
    espath = "http://admin:PASSWORD@datadiscoverystudio.org/geoportal/elastic/"
    baseURL = espath + esresource
    p = {'scroll': p_scroll,
         'scroll_id': scrollID}
    r = requests.get(baseURL, params=p)
    # print("request: ", r.url, r.status_code)
    # print("raw response2: ", r, " status: ", r.status_code)
    # print(r.headers['content-type'])
    if r.status_code == requests.codes.ok:
        response = r.json()
        scrollID = response["_scroll_id"]
        sitemapfilename = "ddssitemap" + str(filecount) + ".xml"
        sitemaphandle = siteMapFile(sitemapfilename)
        writeLinks(response, sitemaphandle)
        sitemaphandle.write('</urlset>')
        sitemaphandle.close()
        # new index entry; use a local file path, matching the first entry above
        indexhandle.write('<sitemap>')
        indexhandle.write('\n')
        # indexhandle.write('<loc>' + geoportalBaseURL + sitemapfilename + '</loc>')
        indexhandle.write('<loc>' + sitemapfilename + '</loc>')
        indexhandle.write('\n')
        indexhandle.write('<lastmod>' + str(datetime.now()) + '</lastmod>')
        indexhandle.write('\n')
        indexhandle.write('</sitemap>')
        indexhandle.write('\n')
        filecount = filecount + 1
        counter = counter + int(p_size)
        print("count: ", counter)
    else:
        r.raise_for_status()
        break
indexhandle.write('</sitemapindex>')
indexhandle.close()
print "done, counter = ",counter