-
Notifications
You must be signed in to change notification settings - Fork 1
/
build.rb
executable file
·101 lines (89 loc) · 2.9 KB
/
build.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env ruby
require 'date'
require 'pdf-reader'
require 'open-uri'
require 'json'
require 'net/http'
require 'active_support'
require 'active_support/core_ext/date/calculations'
# Resolves where a data file lives for the current environment.
#
# @param filename [String] bare file name, e.g. "results.json"
# @return [String] the S3 URL when APP_ENV is "production", otherwise a
#   relative path inside the local "data" directory
def data_file(filename)
  return File.join("data", filename) unless ENV['APP_ENV'] == "production"

  "https://nytimes-covid-frontpage.s3.amazonaws.com/#{filename}"
end
# Builds the URL of the New York Times front-page scan for a given day.
#
# @param date [#strftime] the day of the issue
# @return [String] URL with the date embedded as a YYYY/MM/DD path segment
def nytimes_url(date)
  dated_path = date.strftime('%Y/%m/%d')
  "https://static01.nyt.com/images/#{dated_path}/nytfrontpage/scan.pdf"
end
# Downloads the front-page PDF for every day in the inclusive date range into
# the local "pdfs" directory, skipping days that already have a non-empty file.
#
# A failed download is reported on stdout and does not abort the remaining days.
#
# @param start_date [Date] first day to fetch
# @param end_date [Date] last day to fetch (inclusive)
def download_nytimes_frontpage(start_date, end_date)
  Dir.mkdir("pdfs") unless Dir.exist?("pdfs")

  (start_date..end_date).each do |current_time|
    url = nytimes_url(current_time)
    puts "🌎 Checking URL #{url}"
    filename = File.join("pdfs", "#{current_time}.pdf")

    if File.file?(filename) && !File.zero?(filename)
      puts "🗄️ Using cached file: #{current_time}.pdf"
      next
    end

    begin
      puts "📗 Download new file: #{current_time}.pdf"
      # URI.open is required here: the bare Kernel#open stopped handling URLs
      # in Ruby 3.0. Fetch before touching the target file so a failed request
      # does not leave an empty placeholder behind.
      pdf_bytes = URI.open(url, &:read)
      File.binwrite(filename, pdf_bytes)
    rescue => error
      puts "❌ Failed to save #{url} / #{error}"
    end
  end
end
# Counts pandemic-related keyword occurrences in every PDF under "pdfs".
#
# Each file name (without extension) is parsed as the issue date. Counts from
# all pages of a PDF are summed into a single entry for that date. Files that
# cannot be read or parsed are reported on stdout and skipped.
#
# @return [Hash{Symbol => Integer}] keyword count keyed by the Unix timestamp
#   (as a symbol) of the day after the issue date
def generate_word_count_from_pdf
  puts "🧮 Generating word count..."
  results = {}
  Dir[File.join("pdfs", "*.pdf")].each do |filename|
    begin
      puts "🔍 Analyzing... #{filename}"
      pages = PDF::Reader.new(filename).pages
      next if pages.empty?

      # Advancing 1 day works around a cal-heatmap off-by-one date bug
      issue_date = Date.parse(File.basename(filename, File.extname(filename)))
      current_time = (issue_date + 1).to_time.to_i

      # Sum across pages so a multi-page PDF is not reduced to its last page.
      mentions = pages.sum do |page|
        page.text.downcase.scan(/(?=(corona|covid|virus|pandemic|wuhan))/).count
      end
      results[current_time.to_s.to_sym] = mentions
    rescue => error
      puts "❌ Failed to read #{filename} / #{error}"
    end
  end
  puts "⌛ Processed #{results.count} items"
  results
end
# Serializes data as JSON and writes it to a file inside the "data" directory,
# replacing any existing content.
#
# @param data [Object] any value JSON.generate can serialize
# @param filename [String] file name relative to "data"
# @return [Integer] number of bytes written
def save_json(data, filename)
  destination = File.join("data", filename)
  File.write(destination, JSON.generate(data))
end
# Tells whether the freshly generated data has more entries than the
# previously published data.
#
# The old data is looked up, in order: at old_data_filename as a local path,
# at the location data_file resolves it to (local file or http(s) URL), and
# otherwise treated as empty. Any failure to read or parse the old data is
# reported on stdout and also treated as empty, so the comparison never raises.
#
# @param new_data [#count] newly generated results
# @param old_data_filename [String] name of the previously saved JSON file
# @return [Boolean] true when new_data has strictly more entries
def json_incremented?(new_data, old_data_filename)
  location = data_file(old_data_filename)

  old_data =
    begin
      response =
        if File.file?(old_data_filename)
          File.read(old_data_filename)
        elsif File.file?(location)
          File.read(location)
        elsif location.start_with?("http://", "https://")
          Net::HTTP.get(URI(location))
        else
          "{}"
        end
      # JSON "null" parses to nil; fall back so #count below is always valid.
      JSON.parse(response) || {}
    rescue => error
      puts "❌ Error downloading / #{error}"
      {}
    end

  puts "New count: #{new_data.count} Old count: #{old_data.count}"
  new_data.count > old_data.count
end
# Runs the whole pipeline: download the front pages for the date range,
# recount keywords across all cached PDFs, and rewrite "results.json" only
# when the new result set has more entries than the previous one.
#
# @param start_date [Date] first day to download
# @param end_date [Date] last day to download (inclusive)
def main(start_date:, end_date:)
  environment = ENV.fetch('APP_ENV', 'development')
  puts "APP_ENV=#{environment}"

  download_nytimes_frontpage(start_date, end_date)
  fresh_counts = generate_word_count_from_pdf

  unless json_incremented?(fresh_counts, "results.json")
    puts "No need to generate a new JSON"
    return
  end

  puts "Writing new JSON file"
  save_json(fresh_counts, "results.json")
end
# Script entry: process every front page from 2020-01-01 through today.
main(start_date: Date.new(2020, 1, 1), end_date: Date.today)