-
Notifications
You must be signed in to change notification settings - Fork 156
/
repo.rb
252 lines (216 loc) · 7.69 KB
/
repo.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
require 'application_utils'
class Repo
  attr_reader :logger, :path, :heads, :tags, :repo, :rebuild_all

  # Clone with --mirror:
  #
  #   git clone --mirror git://github.com/rails/rails.git
  #
  PATH = "#{Rails.root}/rails.git"

  # Refs whose commits get imported: main and the numbered *-stable branches.
  HEADS = %r{\Arefs/heads/(main|[\d\-]+(-stable)?)\z}

  # Stable release tags, e.g. refs/tags/v7.1.2.
  TAGS = %r{\Arefs/tags/v[\d.]+\z}

  # This is the entry point to sync the database from cron jobs etc:
  #
  #   bundle exec rails runner Repo.sync
  #
  # is the intended usage. This fetches new stuff, imports new commits if any,
  # imports new releases if any, assigns contributors and updates ranks.
  #
  # If the names manager has been updated since the previous execution special
  # code detects names that are gone and recomputes the contributors for their
  # commits. This can be forced by passing rebuild_all: true.
  def self.sync(path: PATH, heads: HEADS, tags: TAGS, rebuild_all: false)
    new(path: path, heads: heads, tags: tags, rebuild_all: rebuild_all).sync
  end

  def initialize(path: PATH, heads: HEADS, tags: TAGS, rebuild_all: false)
    @logger = Rails.logger
    @path = path
    @heads = heads
    @tags = tags
    # A revised names mapping forces a full rebuild even if not requested.
    @rebuild_all = rebuild_all || names_mapping_updated?
    @repo = Rugged::Repository.new(path)
  end

  # Executes a git command, optionally capturing its output.
  #
  # If the execution is not successful +StandardError+ is raised.
  def git(args, capture=false)
    cmd = "git #{args}"
    logger.info(cmd)
    Dir.chdir(path) do
      if capture
        out = `#{cmd}`
        return out if $?.success?
        raise "git error: #{$?}"
      else
        system(cmd) or raise "git error: #{$?}"
      end
    end
  end

  # Issues a git fetch.
  def fetch
    git 'fetch --quiet --prune'
  end

  # Returns the patch of the given commit.
  def diff(sha1)
    git "diff --no-color #{sha1}^!", true
  end

  # Returns the commits between +from+ and +to+. That is, those reachable from
  # +to+, but not from +from+.
  #
  # We use this method to determine which commits belong to a release.
  def rev_list(from, to)
    arg = from ? "#{from}..#{to}" : to
    lines = git "rev-list #{arg}", true
    lines.split("\n")
  end

  # This method does the actual work behind Repo.sync.
  #
  # Runs under a lock file so overlapping cron invocations do not step on each
  # other, and inside a single database transaction so a failed sync leaves no
  # partial import behind.
  def sync
    ApplicationUtils.acquiring_lock_file('updating') do
      started_at = Time.current
      fetch
      ActiveRecord::Base.transaction do
        ncommits = sync_commits
        nreleases = sync_releases
        if ncommits > 0 || nreleases > 0 || rebuild_all
          sync_names
          sync_ranks
          sync_first_contribution_timestamps
        end
        # Record this run for auditing, and so names_mapping_updated? can
        # compare against started_at next time.
        RepoUpdate.create!(
          ncommits: ncommits,
          nreleases: nreleases,
          started_at: started_at,
          ended_at: Time.current,
          rebuild_all: rebuild_all
        )
        ApplicationUtils.expire_cache if cache_needs_expiration?(ncommits, nreleases)
      end
    end
  end

  protected

  # Returns the repository refs whose names match +regexp+.
  def refs(regexp)
    repo.refs.select do |ref|
      ref.name =~ regexp
    end
  end

  # Imports those commits in the Git repo that do not yet exist in the database
  # by walking the main and stable branches backwards starting at the tips
  # and following parents.
  #
  # Returns the number of commits imported. AR logging is silenced because
  # this may issue one query per commit in the walk.
  def sync_commits
    ncommits = 0
    ActiveRecord::Base.logger.silence do
      refs(heads).each do |ref|
        to_visit = [repo.lookup(ref.target.oid)]
        while commit = to_visit.shift
          unless Commit.exists?(sha1: commit.oid)
            ncommits += 1
            Commit.import!(commit)
            # Only enqueue parents of new commits: an existing commit implies
            # all its ancestors were imported by a previous run.
            to_visit.concat(commit.parents)
          end
        end
      end
    end
    ncommits
  end

  # Imports new releases, if any, determines which commits belong to them, and
  # associates them. By definition, a release corresponds to a stable tag, one
  # that matches <tt>\Av[\d.]+\z</tt>.
  #
  # Returns the number of new releases.
  def sync_releases
    new_releases = []
    refs(tags).each do |ref|
      tag = ref.name[%r{[^/]+\z}]
      unless Release.exists?(tag: tag)
        target = ref.target
        # Annotated tags point at a tag object whose target is the commit;
        # lightweight tags point at the commit directly.
        commit = target.is_a?(Rugged::Commit) ? target : target.target
        new_releases << Release.import!(tag, commit)
      end
    end
    Release.process_commits(self, new_releases)
    new_releases.size
  end

  # Computes the name of the contributors and adjusts associations and the
  # names table. If some names are gone due to new mappings collapsing two
  # names into one, for example, the credit for commits of gone names is
  # revised, resulting in the canonical name being associated.
  def sync_names
    Contribution.delete_all if rebuild_all
    assign_contributors
    Contributor.with_no_commits.delete_all if rebuild_all
  end

  # Once all tables have been updated we compute the rank of each contributor.
  #
  # Contributors with the same number of commits share the same rank
  # (standard competition ranking: 1, 2, 2, 4, ...).
  def sync_ranks
    i = 0
    prev_ncommits = nil
    new_rank = 0
    ranks_to_update = Hash.new {|h, k| h[k] = []}
    # Compute new ranks, and store those which need to be updated.
    Contributor.all_with_ncommits.each do |contributor|
      i += 1
      if contributor.ncommits != prev_ncommits
        new_rank = i
        prev_ncommits = contributor.ncommits
      end
      if contributor.rank != new_rank
        ranks_to_update[new_rank] << contributor.id
      end
    end
    # Update new ranks, if any. One UPDATE per distinct rank value.
    ranks_to_update.each do |rank, contributor_ids|
      Contributor.where(id: contributor_ids).update_all(rank: rank)
    end
  end

  # Stamps the timestamp of each contributor's first contribution. When not
  # rebuilding everything, only contributors missing the timestamp are touched.
  def sync_first_contribution_timestamps
    Contributor.set_first_contribution_timestamps(!rebuild_all)
  end

  # Determines whether the names mapping has been updated. This is useful because
  # if the names mapping is up to date we only need to assign contributors for
  # new commits.
  def names_mapping_updated?
    @nmu ||= begin
      lastru = RepoUpdate.last
      # Use started_at in case a revised names manager is deployed while an update
      # is running.
      lastru ? NamesManager.updated_since?(lastru.started_at) : true
    end
  end

  # Goes over all or new commits in the database and builds a hash that maps
  # each sha1 to the array of the canonical names of their contributors.
  #
  # This computation ignores the current contributions table altogether, it
  # only takes into account the current mapping rules for name resolution.
  def compute_contributor_names_per_commit
    Hash.new {|h, sha1| h[sha1] = []}.tap do |contributor_names_per_commit|
      Commit.with_no_contributors.find_each do |commit|
        commit.extract_contributor_names(self).each do |contributor_name|
          contributor_names_per_commit[commit.sha1] << contributor_name
        end
      end
    end
  end

  # Iterates over all commits with no contributors and assigns to them the ones
  # in the previously computed <tt>contributor_names_per_commit</tt>.
  #
  # Rows are bulk-inserted through PostgreSQL COPY for speed. The CSV rows are
  # built from integer ids only, so no CSV quoting is needed.
  def assign_contributors
    contributor_names_per_commit = compute_contributor_names_per_commit
    # Memoizes find-or-create per name so each contributor is looked up once.
    contributors = Hash.new {|h, name| h[name] = Contributor.find_or_create_by(name: name)}
    data = []
    Commit.with_no_contributors.find_each do |commit|
      contributor_names_per_commit[commit.sha1].each do |contributor_name|
        # FIXME: This check is needed because creation in a few exceptional
        # cases fails due to url_id collisions (Geoffrey ROGUELON, Adam), or
        # due blank url_ids (प्रथमेश).
        if contributors[contributor_name].id
          data << "#{contributors[contributor_name].id},#{commit.id}\n"
        end
      end
    end
    conn = ActiveRecord::Base.connection.raw_connection
    conn.copy_data('COPY contributions (contributor_id, commit_id) FROM STDIN CSV') do
      data.each do |row|
        conn.put_copy_data(row)
      end
    end
  end

  # Do we need to expire the cached pages?
  def cache_needs_expiration?(ncommits, nreleases)
    ncommits > 0 || nreleases > 0 || rebuild_all
  end
end