|
| 1 | +# Compression functions for stdlib version data |
| 2 | + |
# Tally how often every UUID is mentioned across all version maps, and record the
# name of each UUID that appears as a stdlib entry.
#
# `stdlibs_by_version` is iterated as `(key, stdlib_dict)` pairs; each `stdlib_dict`
# is iterated as `(uuid, info)` pairs where `info` provides `name`, `deps` and
# `weakdeps`. A UUID is counted once per entry it keys and once per appearance in a
# `deps` or `weakdeps` list.
#
# Returns `(uuid_counts, uuid_to_name)`. UUIDs that only ever occur as a dependency
# are counted but get no entry in `uuid_to_name`.
function analyze_uuid_usage(stdlibs_by_version)
    uuid_counts = Dict{UUID, Int}()
    uuid_to_name = Dict{UUID, String}()

    # Add one to the tally of `id`, starting from zero for a UUID not seen before.
    bump!(id) = (uuid_counts[id] = get(uuid_counts, id, 0) + 1)

    for (_, stdlib_dict) in stdlibs_by_version, (uuid, info) in stdlib_dict
        bump!(uuid)
        uuid_to_name[uuid] = info.name
        foreach(bump!, info.deps)
        foreach(bump!, info.weakdeps)
    end

    return uuid_counts, uuid_to_name
end
| 25 | + |
# Invert the per-version maps into a per-UUID history.
#
# `stdlibs_by_version` is iterated as `(version, stdlib_dict)` pairs and each
# `stdlib_dict` as `(uuid, info)` pairs. The result maps every UUID to the vector of
# `(version, info)` tuples it was seen with, sorted ascending by `version`.
function group_by_uuid(stdlibs_by_version)
    uuid_to_versions = Dict{UUID, Vector{Tuple{VersionNumber, StdlibInfo}}}()

    for (version, stdlib_dict) in stdlibs_by_version, (uuid, info) in stdlib_dict
        # `get!` creates the correctly typed history on first sight of a UUID with a
        # single lookup, instead of storing an untyped `[]` that must be converted.
        history = get!(uuid_to_versions, uuid) do
            Tuple{VersionNumber, StdlibInfo}[]
        end
        push!(history, (version, info))
    end

    # Input iteration order is not assumed, so order every history explicitly.
    for history in values(uuid_to_versions)
        sort!(history; by=first)
    end

    return uuid_to_versions
end
| 45 | + |
# Decide whether two `StdlibInfo` values share the same base info, i.e. agree on
# `name`, `uuid`, `deps` and `weakdeps`. The `version` field is deliberately not
# compared.
function base_info_equal(a::StdlibInfo, b::StdlibInfo)
    a.name == b.name || return false
    a.uuid == b.uuid || return false
    a.deps == b.deps || return false
    return a.weakdeps == b.weakdeps
end
| 53 | + |
# Split a sequence of `(version, info)` entries into segments of constant base info.
#
# A segment is a maximal run of consecutive entries whose info is `base_info_equal`
# to the info of the run's first entry. Inside each segment, consecutive entries
# with the same `info.version` are merged into one range.
#
# Returns a vector of `(base_info, version_ranges)` tuples, where `base_info` is the
# info of the segment's first entry and `version_ranges` is a vector of
# `((first_version, last_version), info_version)` tuples. An empty input yields an
# empty vector.
function find_base_info_segments(versions_and_infos)
    segments = []
    total = length(versions_and_infos)
    first_idx = 1

    while first_idx <= total
        anchor_info = versions_and_infos[first_idx][2]

        # Extend the segment while the next entry still matches the anchor.
        last_idx = first_idx
        while last_idx < total &&
                base_info_equal(versions_and_infos[last_idx + 1][2], anchor_info)
            last_idx += 1
        end

        chunk = versions_and_infos[first_idx:last_idx]
        chunk_len = length(chunk)

        # Collapse runs of identical package version inside the segment.
        version_ranges = []
        lo = 1
        while lo <= chunk_len
            pkg_version = chunk[lo][2].version

            hi = lo
            while hi < chunk_len && chunk[hi + 1][2].version == pkg_version
                hi += 1
            end

            push!(version_ranges, ((chunk[lo][1], chunk[hi][1]), pkg_version))
            lo = hi + 1
        end

        push!(segments, (anchor_info, version_ranges))
        first_idx = last_idx + 1
    end

    return segments
end
| 103 | + |
# Run `find_base_info_segments` over the history of every UUID.
#
# `uuid_to_versions` is iterated as `(uuid, versions_and_infos)` pairs; the result
# maps each UUID to the segment vector computed for its history.
function analyze_stdlib_patterns(uuid_to_versions)
    return Dict{UUID, Vector}(
        uuid => find_base_info_segments(history) for (uuid, history) in uuid_to_versions
    )
end
| 115 | + |
# Render `uuid` as source text: the constant name registered for it in
# `uuid_constants` when there is one, otherwise a literal `UUID("...")` call.
function format_uuid(uuid, uuid_constants)
    fallback = string("UUID(\"", uuid, "\")")
    return get(uuid_constants, uuid, fallback)
end
| 120 | + |
# Render `info` as the source text of a `StdlibBaseInfo(...)` keyword call.
#
# `name` and `uuid` are always emitted; `deps` and `weakdeps` are emitted only when
# non-empty. UUIDs are rendered through `format_uuid`, so they appear as constant
# names where `uuid_constants` has one.
function format_base_info(info, uuid_constants)
    # Render a collection of UUIDs as a `UUID[...]` literal.
    render_list(uuids) =
        "UUID[" * join([format_uuid(u, uuid_constants) for u in uuids], ", ") * "]"

    fields = String[]
    push!(fields, "name = \"$(info.name)\"")
    push!(fields, "uuid = $(format_uuid(info.uuid, uuid_constants))")
    isempty(info.deps) || push!(fields, "deps = $(render_list(info.deps))")
    isempty(info.weakdeps) || push!(fields, "weakdeps = $(render_list(info.weakdeps))")

    return "StdlibBaseInfo(" * join(fields, ", ") * ")"
end
| 139 | + |
# Write the compressed version map as Julia source to `output_fname`.
#
# Arguments:
# - `output_fname`: path of the file to create or overwrite.
# - `stdlibs_by_version`: iterable of `(version, stdlib_dict)` pairs, passed to
#   `analyze_uuid_usage` and `group_by_uuid`.
# - `unregistered_stdlibs`: iterable of `(uuid, info)` pairs written verbatim as
#   `UNREGISTERED_STDLIBS`, sorted by `info.name`.
# - `const_threshold` (keyword, default 5): minimum number of mentions a stdlib UUID
#   needs before a named `const <name>_uuid` is emitted for it.
#
# The generated file contains, in order: a header comment, the UUID constants (only
# if there are any), `STDLIB_SEGMENTS` sorted by stdlib name, and
# `UNREGISTERED_STDLIBS`. Returns `nothing`.
function write_compressed_version_map(
    output_fname, stdlibs_by_version, unregistered_stdlibs; const_threshold::Integer=5
)
    # Analyze and prepare compression
    @info("Analyzing version map for compression...")
    uuid_counts, uuid_to_name = analyze_uuid_usage(stdlibs_by_version)

    # Define constants for frequently used UUIDs. Only UUIDs with a known name can
    # get one; the name is sanitized so the result is a plain identifier.
    uuid_constants = Dict{UUID, String}()
    for (uuid, count) in uuid_counts
        if count >= const_threshold && haskey(uuid_to_name, uuid)
            uuid_constants[uuid] =
                replace("$(uuid_to_name[uuid])_uuid", r"[^a-zA-Z0-9_]" => "_")
        end
    end

    uuid_to_versions = group_by_uuid(stdlibs_by_version)
    uuid_to_segments = analyze_stdlib_patterns(uuid_to_versions)

    # Small formatting helpers shared by both emitted tables.
    version_literal(v) = isnothing(v) ? "nothing" : "v\"$(v)\""
    # An empty collection naturally renders as `UUID[]`, so no special case is needed.
    uuid_list(uuids) =
        "UUID[" * join([format_uuid(u, uuid_constants) for u in uuids], ", ") * "]"
    # Separator after element `i` of `n`: every element but the last gets a comma.
    trailing_comma(i, n) = i < n ? "," : ""

    # Output compressed version map
    @info("Outputting compressed version map to $(output_fname)")
    open(output_fname, "w") do io
        println(io, "## This file autogenerated by ext/HistoricalStdlibGenerator/generate_historical_stdlibs.jl")
        println(io)
        println(io, "# Julia standard libraries with segment-based compression:")
        println(io, "# - Each stdlib split into segments where base info (name, uuid, deps, weakdeps) is constant")
        println(io, "# - Within each segment, only package version numbers stored per Julia version range")
        println(io)

        # Write UUID constants, sorted by constant name for a stable file layout
        if !isempty(uuid_constants)
            println(io, "# UUID constants")
            for (uuid, const_name) in sort(collect(uuid_constants), by=x->x[2])
                println(io, "const $(const_name) = UUID(\"$(uuid)\")")
            end
            println(io)
        end

        # Write stdlib info with segments
        println(io, "# Format: UUID => [StdlibSegment(...), ...]")
        println(io, "const STDLIB_SEGMENTS = Dict{UUID, Vector{StdlibSegment}}(")

        sorted_uuids = sort(collect(keys(uuid_to_segments)), by=u->uuid_to_name[u])

        for (idx, uuid) in enumerate(sorted_uuids)
            segments = uuid_to_segments[uuid]
            println(io, " $(format_uuid(uuid, uuid_constants)) => [")

            for (seg_idx, (base_info, version_ranges)) in enumerate(segments)
                println(io, " StdlibSegment(")
                println(io, " base_info = ", format_base_info(base_info, uuid_constants), ",")
                println(io, " version_ranges = [")

                # A single-version range is written with identical endpoints, so one
                # format covers both the single and the multi-version case.
                for (range_idx, ((start_v, end_v), ver)) in enumerate(version_ranges)
                    println(
                        io,
                        " (v\"$(start_v)\", v\"$(end_v)\") => ",
                        version_literal(ver),
                        trailing_comma(range_idx, length(version_ranges)),
                    )
                end

                println(io, " ],")
                println(io, " )", trailing_comma(seg_idx, length(segments)))
            end

            println(io, " ]", trailing_comma(idx, length(sorted_uuids)))
        end

        println(io, ")")
        println(io)

        # Write UNREGISTERED_STDLIBS
        print(io, """
        # Next, we also embed a list of stdlibs that must _always_ be treated as stdlibs,
        # because they cannot be resolved in the registry; they have only ever existed within
        # the Julia stdlib source tree, and because of that, trying to resolve them will fail.
        """)
        println(io, "const UNREGISTERED_STDLIBS = Dict{UUID,StdlibInfo}(")
        unregistered = sort(collect(unregistered_stdlibs), by=x->x[2].name)
        for (idx, (uuid, info)) in enumerate(unregistered)
            uuid_str = format_uuid(uuid, uuid_constants)

            println(io, " $(uuid_str) => StdlibInfo(")
            println(io, " \"$(info.name)\",")
            println(io, " $(uuid_str),")
            println(io, " $(version_literal(info.version)),")
            println(io, " $(uuid_list(info.deps)),")
            println(io, " $(uuid_list(info.weakdeps)),")
            println(io, " )", trailing_comma(idx, length(unregistered_stdlibs)))
        end
        println(io, ")")
    end

    return nothing
end
0 commit comments