Skip to content

Commit e2ea1eb

Browse files
KristofferCKristofferC
authored and committed
compress the version map a bit
We use a compression scheme in which each stdlib, for a given set of dependencies, is associated with a range of Julia versions. Within that Julia range we also store how the version of the stdlib itself has evolved. The compression and uncompression code was written by an LLM, following my instructions on how the data should be compressed.
1 parent 047f9f7 commit e2ea1eb

File tree

8 files changed

+1594
-15375
lines changed

8 files changed

+1594
-15375
lines changed

ext/HistoricalStdlibGenerator/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
33
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
44
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
5+
Runic = "62bfec6d-59d7-401d-8490-b29ee721c001"
56
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
67
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
# Compression functions for stdlib version data
2+
3+
"""
    analyze_uuid_usage(stdlibs_by_version)

Tally how often every UUID is mentioned across `stdlibs_by_version` and record the
name of every stdlib that appears as an entry.

`stdlibs_by_version` is iterated as `(key, stdlib_dict)` pairs, where `stdlib_dict`
yields `(uuid, info)` pairs and `info` exposes `name`, `deps` and `weakdeps`.

Returns a tuple `(uuid_counts, uuid_to_name)`:
- `uuid_counts::Dict{UUID, Int}` counts one occurrence per stdlib entry plus one per
  appearance in any `deps` or `weakdeps` list.
- `uuid_to_name::Dict{UUID, String}` only has keys for UUIDs that appear as an entry
  of their own; a UUID seen solely as a dependency gets a count but no name.
"""
function analyze_uuid_usage(stdlibs_by_version)
    occurrences = Dict{UUID, Int}()
    names = Dict{UUID, String}()

    # Increment the tally for a single UUID mention
    bump!(id) = (occurrences[id] = get(occurrences, id, 0) + 1)

    for (_, stdlibs) in stdlibs_by_version, (id, entry) in stdlibs
        bump!(id)
        # Later entries overwrite earlier ones, so the last name seen is kept
        names[id] = entry.name
        foreach(bump!, entry.deps)
        foreach(bump!, entry.weakdeps)
    end

    return occurrences, names
end
25+
26+
"""
    group_by_uuid(stdlibs_by_version)

Regroup the per-Julia-version stdlib tables into one history per stdlib UUID.

`stdlibs_by_version` is iterated as `(version, stdlib_dict)` pairs and each
`stdlib_dict` as `(uuid, info)` pairs. The result maps every UUID to a vector of
`(version, info)` tuples sorted by `version`. Only versions in which the UUID is
present contribute an entry.
"""
function group_by_uuid(stdlibs_by_version)
    grouped = Dict{UUID, Vector{Tuple{VersionNumber, StdlibInfo}}}()

    for (julia_version, stdlibs) in stdlibs_by_version, (uuid, info) in stdlibs
        # Create the history vector on first sight of this UUID
        history = get!(() -> Tuple{VersionNumber, StdlibInfo}[], grouped, uuid)
        push!(history, (julia_version, info))
    end

    # Order every history by the Julia version it was recorded for
    foreach(history -> sort!(history; by = first), values(grouped))

    return grouped
end
45+
46+
"""
    base_info_equal(a::StdlibInfo, b::StdlibInfo)

Return `true` when `a` and `b` agree on `name`, `uuid`, `deps` and `weakdeps`.
The `version` field is deliberately left out of the comparison.

The dependency lists are compared with `==`, so two lists holding the same UUIDs in
a different order are treated as different.
"""
function base_info_equal(a::StdlibInfo, b::StdlibInfo)
    compared_fields = (:name, :uuid, :deps, :weakdeps)
    return all(compared_fields) do field
        getproperty(a, field) == getproperty(b, field)
    end
end
53+
54+
"""
    find_base_info_segments(versions_and_infos)

Split one stdlib's history into segments of constant base info and, inside each
segment, into runs of constant package version.

`versions_and_infos` is an indexable collection of `(julia_version, info)` entries,
processed in the order given. Consecutive entries belong to the same segment while
`base_info_equal` holds between them and the first entry of the segment. Inside a
segment, consecutive entries with the same `info.version` form one run.

Returns a vector of `(base_info, version_ranges)` tuples, where `base_info` is the
`info` of the segment's first entry and `version_ranges` is a vector of
`((first_julia_version, last_julia_version), package_version)` tuples. An empty
input yields an empty vector.

NOTE(review): runs are formed from neighbouring list entries only; nothing here
checks that the Julia versions of neighbouring entries are themselves adjacent.
Whether a gap in the input can occur, and how the reader of these ranges treats it,
should be confirmed against the callers.
"""
function find_base_info_segments(versions_and_infos)
    # Index of the last item of the run that starts at `start`: the run keeps growing
    # while the next item satisfies `matches_head`.
    function run_end(items, start, matches_head)
        stop = start
        while stop < length(items) && matches_head(items[stop + 1])
            stop += 1
        end
        return stop
    end

    segments = []
    seg_start = 1

    while seg_start <= length(versions_and_infos)
        head_info = versions_and_infos[seg_start][2]
        seg_stop = run_end(
            versions_and_infos, seg_start,
            entry -> base_info_equal(entry[2], head_info),
        )
        entries = versions_and_infos[seg_start:seg_stop]

        # Collapse the segment into runs that share one package version
        version_ranges = []
        run_start = 1
        while run_start <= length(entries)
            pkg_version = entries[run_start][2].version
            run_stop = run_end(
                entries, run_start,
                entry -> entry[2].version == pkg_version,
            )
            julia_span = (entries[run_start][1], entries[run_stop][1])
            push!(version_ranges, (julia_span, pkg_version))
            run_start = run_stop + 1
        end

        push!(segments, (head_info, version_ranges))
        seg_start = seg_stop + 1
    end

    return segments
end
103+
104+
"""
    analyze_stdlib_patterns(uuid_to_versions)

Apply `find_base_info_segments` to the history of every stdlib.

`uuid_to_versions` is iterated as `(uuid, versions_and_infos)` pairs. The result is
a `Dict{UUID, Vector}` mapping each UUID to the segments computed for its history.
"""
function analyze_stdlib_patterns(uuid_to_versions)
    return Dict{UUID, Vector}(
        uuid => find_base_info_segments(history)
            for (uuid, history) in uuid_to_versions
    )
end
115+
116+
"""
    format_uuid(uuid, uuid_constants)

Return the source text used to refer to `uuid`: the entry stored for it in
`uuid_constants` when there is one, otherwise a `UUID("...")` constructor call.
"""
function format_uuid(uuid, uuid_constants)
    return get(uuid_constants, uuid) do
        # No entry for this UUID: spell it out as a literal
        "UUID(\"$(uuid)\")"
    end
end
120+
121+
"""
    format_base_info(info, uuid_constants)

Render `info` as the source text of a `StdlibBaseInfo(...)` keyword call.

`name` and `uuid` are always written. `deps` and `weakdeps` are written only when
non-empty. Every UUID is rendered through `format_uuid`.
"""
function format_base_info(info, uuid_constants)
    # Source text for a `UUID[...]` literal holding `ids`
    uuid_list(ids) =
        "UUID[" * join((format_uuid(id, uuid_constants) for id in ids), ", ") * "]"

    fields = String[
        "name = \"$(info.name)\"",
        "uuid = $(format_uuid(info.uuid, uuid_constants))",
    ]
    isempty(info.deps) || push!(fields, "deps = $(uuid_list(info.deps))")
    isempty(info.weakdeps) || push!(fields, "weakdeps = $(uuid_list(info.weakdeps))")

    return "StdlibBaseInfo(" * join(fields, ", ") * ")"
end
139+
140+
"""
    write_compressed_version_map(output_fname, stdlibs_by_version, unregistered_stdlibs)

Write the segment-compressed stdlib version map to `output_fname` as Julia source.

The generated file contains, in order:
1. `const <name>_uuid = UUID("...")` definitions for every UUID that is counted at
   least 5 times by `analyze_uuid_usage` and has a known name,
2. `const STDLIB_SEGMENTS = Dict{UUID, Vector{StdlibSegment}}(...)`, with entries
   ordered by stdlib name,
3. `const UNREGISTERED_STDLIBS = Dict{UUID,StdlibInfo}(...)`, with entries ordered
   by stdlib name.

# Arguments
- `output_fname`: path of the file to create or overwrite.
- `stdlibs_by_version`: iterable of `(julia_version, stdlib_dict)` pairs, passed to
  `analyze_uuid_usage` and `group_by_uuid`.
- `unregistered_stdlibs`: collection of `(uuid, info)` pairs where `info` exposes
  `name`, `version`, `deps` and `weakdeps`.

Returns `nothing`.
"""
function write_compressed_version_map(output_fname, stdlibs_by_version, unregistered_stdlibs)
    # Analyze and prepare compression
    @info("Analyzing version map for compression...")
    uuid_counts, uuid_to_name = analyze_uuid_usage(stdlibs_by_version)

    # Define constants for frequently used UUIDs
    const_threshold = 5
    uuid_constants = Dict{UUID, String}()
    taken_names = Set{String}()
    # Visit candidates in a fixed order so that, should two UUIDs sanitize to the same
    # identifier, the same one keeps the constant on every run.
    for (uuid, count) in sort(collect(uuid_counts); by = p -> string(first(p)))
        (count >= const_threshold && haskey(uuid_to_name, uuid)) || continue
        const_name = replace("$(uuid_to_name[uuid])_uuid", r"[^a-zA-Z0-9_]" => "_")
        # A second `const` with the same name would make the generated file invalid;
        # the colliding UUID is simply written out as a literal instead.
        const_name in taken_names && continue
        push!(taken_names, const_name)
        uuid_constants[uuid] = const_name
    end

    uuid_to_versions = group_by_uuid(stdlibs_by_version)
    uuid_to_segments = analyze_stdlib_patterns(uuid_to_versions)

    # Small formatting helpers shared by both sections of the output
    uuid_list(ids) =
        "UUID[" * join((format_uuid(id, uuid_constants) for id in ids), ", ") * "]"
    version_literal(v) = isnothing(v) ? "nothing" : "v\"$(v)\""
    # "," after every element except the last one
    separator(i, items) = i < length(items) ? "," : ""

    # Output compressed version map
    @info("Outputting compressed version map to $(output_fname)")
    open(output_fname, "w") do io
        println(io, "## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl")
        println(io)
        println(io, "# Julia standard libraries with segment-based compression:")
        println(io, "# - Each stdlib split into segments where base info (name, uuid, deps, weakdeps) is constant")
        println(io, "# - Within each segment, only package version numbers stored per Julia version range")
        println(io)

        # Write UUID constants
        if !isempty(uuid_constants)
            println(io, "# UUID constants")
            for (uuid, const_name) in sort(collect(uuid_constants), by = x -> x[2])
                println(io, "const $(const_name) = UUID(\"$(uuid)\")")
            end
            println(io)
        end

        # Write stdlib info with segments
        println(io, "# Format: UUID => [StdlibSegment(...), ...]")
        println(io, "const STDLIB_SEGMENTS = Dict{UUID, Vector{StdlibSegment}}(")

        sorted_uuids = sort(collect(keys(uuid_to_segments)), by = u -> uuid_to_name[u])

        for (idx, uuid) in enumerate(sorted_uuids)
            segments = uuid_to_segments[uuid]
            println(io, " $(format_uuid(uuid, uuid_constants)) => [")

            for (seg_idx, (base_info, version_ranges)) in enumerate(segments)
                println(io, " StdlibSegment(")
                println(io, " base_info = ", format_base_info(base_info, uuid_constants), ",")
                println(io, " version_ranges = [")

                # A single-version range is written with the same value at both ends,
                # which is exactly what printing (start_v, end_v) produces.
                for (range_idx, ((start_v, end_v), ver)) in enumerate(version_ranges)
                    println(
                        io, " (v\"$(start_v)\", v\"$(end_v)\") => ",
                        version_literal(ver), separator(range_idx, version_ranges),
                    )
                end

                println(io, " ],")
                println(io, " )", separator(seg_idx, segments))
            end

            println(io, " ]", separator(idx, sorted_uuids))
        end

        println(io, ")")
        println(io)

        # Write UNREGISTERED_STDLIBS
        print(io, """
        # Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,
        # because they cannot be resolved in the registry; they have only ever existed within
        # the Julia stdlib source tree, and because of that, trying to resolve them will fail.
        """)
        println(io, "const UNREGISTERED_STDLIBS = Dict{UUID,StdlibInfo}(")
        sorted_unregistered = sort(collect(unregistered_stdlibs), by = x -> x[2].name)
        for (idx, (uuid, info)) in enumerate(sorted_unregistered)
            uuid_str = format_uuid(uuid, uuid_constants)

            println(io, " $(uuid_str) => StdlibInfo(")
            println(io, " \"$(info.name)\",")
            println(io, " $(uuid_str),")
            println(io, " $(version_literal(info.version)),")
            println(io, " $(uuid_list(info.deps)),")
            println(io, " $(uuid_list(info.weakdeps)),")
            println(io, " )", separator(idx, sorted_unregistered))
        end
        println(io, ")")
    end

    return nothing
end

ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl

Lines changed: 22 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env julia
22

3-
using Downloads, JSON3, Base.BinaryPlatforms, Scratch, SHA, Pkg, TOML
3+
using Downloads, JSON3, Base.BinaryPlatforms, Scratch, SHA, Pkg, TOML, Runic
44
include("../../src/StdlibInfo.jl")
5+
include("compress.jl")
56

67
# Work around issues where we attempt to `eval()` code from Julia versions
78
# that have `Pkg.Types.StdlibInfo` (and embed that exact symbol path)
@@ -256,48 +257,26 @@ unregistered_stdlibs = filter(all_stdlibs) do (uuid, _)
256257
return !any(haskey(reg.pkgs, uuid) for reg in registries)
257258
end
258259

259-
# Helper function for getting these printed out in a nicely-sorted order
260-
function print_sorted(io::IO, d::Dict; indent::Int=0)
261-
println(io, "Dict{UUID,StdlibInfo}(")
262-
for (uuid, (name, version, deps, weakdeps)) in sort(collect(d), by = kv-> kv[2][1])
263-
println(io,
264-
" "^indent,
265-
repr(uuid), " => StdlibInfo(\n",
266-
" "^(indent + 4), repr(name), ",\n",
267-
" "^(indent + 4), repr(uuid), ",\n",
268-
" "^(indent + 4), repr(version), ",\n",
269-
" "^(indent + 4), repr(sort(deps)), ",\n",
270-
" "^(indent + 4), repr(sort(weakdeps)), ",\n",
271-
" "^indent, "),",
272-
)
273-
end
274-
print(io, " "^(max(indent - 4, 0)), ")")
275-
end
260+
# Convert versions_dict to the format expected by compression functions
261+
# Convert tuples (name, version, deps, weakdeps) to StdlibInfo objects
262+
stdlibs_by_version = [
263+
v => Dict{UUID, StdlibInfo}(
264+
uuid => StdlibInfo(info[1], uuid, info[2], info[3], info[4])
265+
for (uuid, info) in stdlib_dict
266+
)
267+
for (v, stdlib_dict) in [(v, versions_dict[v]) for v in sort(collect(keys(versions_dict)))]
268+
]
276269

277-
output_fname = joinpath(dirname(dirname(@__DIR__)), "src", "version_map.jl")
278-
@info("Outputting to $(output_fname)")
279-
sorted_versions = sort(collect(keys(versions_dict)))
280-
open(output_fname, "w") do io
281-
print(io, """
282-
## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl
270+
# Convert unregistered_stdlibs tuples to StdlibInfo objects
271+
unregistered_stdlibs_info = Dict{UUID, StdlibInfo}(
272+
uuid => StdlibInfo(info[1], uuid, info[2], info[3], info[4])
273+
for (uuid, info) in unregistered_stdlibs
274+
)
283275

284-
# Julia standard libraries with duplicate entries removed so as to store only the
285-
# first release in a set of releases that all contain the same set of stdlibs.
286-
const STDLIBS_BY_VERSION = [
287-
""")
288-
for v in sorted_versions
289-
print(io, " $(repr(v)) => ")
290-
print_sorted(io, versions_dict[v]; indent=8)
291-
println(io, ",")
292-
println(io)
293-
end
294-
println(io, "]")
276+
# Write compressed version map
277+
output_fname = joinpath(dirname(dirname(@__DIR__)), "src", "version_map_compressed.jl")
278+
write_compressed_version_map(output_fname, stdlibs_by_version, unregistered_stdlibs_info)
295279

296-
println(io)
297-
print(io, """
298-
# Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,
299-
# because they cannot be resolved in the registry; they have only ever existed within
300-
# the Julia stdlib source tree, and because of that, trying to resolve them will fail.
301-
const UNREGISTERED_STDLIBS =""")
302-
print_sorted(io, unregistered_stdlibs; indent=4)
303-
end
280+
# Format the output file with Runic
281+
@info("Formatting output file with Runic...")
282+
Runic.format(output_fname; overwrite=true)

src/HistoricalStdlibVersions.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ module HistoricalStdlibVersions
77
using Pkg
88
using PrecompileTools: @setup_workload, @compile_workload
99
include("StdlibInfo.jl")
10-
include("version_map.jl")
10+
include("version_map_compressed.jl")
11+
include("uncompress.jl")
1112

1213
let
1314
max_hsg_version = maximum(first.(STDLIBS_BY_VERSION))

src/StdlibInfo.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,17 @@ else
1919
import Pkg.Types: StdlibInfo
2020
end
2121

22+
"""
    StdlibBaseInfo(; name, uuid, deps = UUID[], weakdeps = UUID[])

The part of a stdlib description that excludes its version: `name`, `uuid`, and the
UUIDs listed in `deps` and `weakdeps`. Both lists default to empty.
"""
Base.@kwdef struct StdlibBaseInfo
    name::String
    uuid::UUID
    deps::Vector{UUID} = Vector{UUID}()
    weakdeps::Vector{UUID} = Vector{UUID}()
end
29+
30+
"""
    StdlibSegment(; base_info, version_ranges)

One segment of a stdlib's history: a `StdlibBaseInfo` together with a vector of
`(first, last) => version` pairs. Each key is a tuple of two `VersionNumber`s and
each value is either a `VersionNumber` or `nothing`.
"""
Base.@kwdef struct StdlibSegment
    base_info::StdlibBaseInfo
    version_ranges::Vector{Pair{NTuple{2, VersionNumber}, Union{Nothing, VersionNumber}}}
end
35+

0 commit comments

Comments
 (0)