Skip to content

Commit a25e1dc

Browse files
author
wuliang
committed
fix: task ID of image blobs
Signed-off-by: [email protected] <wuliang>
1 parent 118416e commit a25e1dc

File tree

5 files changed

+170
-3
lines changed

5 files changed

+170
-3
lines changed

dragonfly-client-config/src/dfdaemon.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,6 +1105,12 @@ impl Default for Rule {
11051105
}
11061106
}
11071107

1108+
/// default_enable_task_id_based_blob_digest returns the value used for
/// `enable_task_id_based_blob_digest` when the option is not set explicitly.
///
/// The option is enabled (`true`) by default.
#[inline]
fn default_enable_task_id_based_blob_digest() -> bool {
    true
}
1113+
11081114
/// RegistryMirror is the registry mirror configuration.
11091115
#[derive(Debug, Clone, Validate, Deserialize)]
11101116
#[serde(default, rename_all = "camelCase")]
@@ -1120,6 +1126,14 @@ pub struct RegistryMirror {
11201126
/// If registry use self-signed cert, the client should set the
11211127
/// cert for the registry mirror.
11221128
pub cert: Option<PathBuf>,
1129+
1130+
/// enable_task_id_based_blob_digest indicates whether to calculate the task ID based on the blob's SHA256 digest
1131+
/// for OCI registry blob download URLs. When enabled, if the download URL is for an image blob
1132+
/// (e.g., /v2/<name>/blobs/sha256:<digest>), the task ID will be calculated based on the blob's digest
1133+
/// instead of the full URL. This allows the same blob from different registries to share the same task ID,
1134+
/// avoiding redundant downloads and storage.
1135+
#[serde(default = "default_enable_task_id_based_blob_digest")]
1136+
pub enable_task_id_based_blob_digest: bool,
11231137
}
11241138

11251139
/// RegistryMirror implements Default.
@@ -1128,6 +1142,7 @@ impl Default for RegistryMirror {
11281142
Self {
11291143
addr: default_proxy_registry_mirror_addr(),
11301144
cert: None,
1145+
enable_task_id_based_blob_digest: default_enable_task_id_based_blob_digest(),
11311146
}
11321147
}
11331148
}

dragonfly-client-util/src/id_generator/mod.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,31 @@ const SEED_PEER_SUFFIX: &str = "seed";
3131
/// PERSISTENT_CACHE_TASK_SUFFIX is the suffix of the persistent cache task.
3232
const PERSISTENT_CACHE_TASK_SUFFIX: &str = "persistent-cache-task";
3333

34+
/// extract_blob_digest_from_url extracts the blob digest from a registry blob URL path.
///
/// Returns `Some("sha256:<hex>")` when `path` contains `/blobs/sha256:` followed by exactly
/// 64 lowercase hexadecimal characters, optionally followed by a query string (`?...`) or a
/// fragment (`#...`). Returns `None` for every other input.
///
/// Uppercase hex digits are rejected on purpose: the OCI image specification requires the
/// encoded portion of a `sha256` digest to match `[a-f0-9]{64}`, so a mixed-case value is not
/// a canonical digest and must not be used as a shared task identity.
///
/// Example blob URLs:
/// - `/v2/<name>/blobs/sha256:<digest>`
/// - `/v2/<namespace>/<repo>/blobs/sha256:<digest>`
pub fn extract_blob_digest_from_url(path: &str) -> Option<String> {
    /// Path marker that precedes the hex-encoded digest.
    const BLOB_DIGEST_MARKER: &str = "/blobs/sha256:";
    /// Length of a hex-encoded SHA256 digest.
    const SHA256_HEX_LEN: usize = 64;

    // Take everything after the first occurrence of the marker.
    let (_, after_marker) = path.split_once(BLOB_DIGEST_MARKER)?;

    // Drop a trailing query string or fragment; `split` always yields at least one item.
    let digest = after_marker.split(['?', '#']).next()?;

    // Only a canonical digest (64 lowercase hex characters) is accepted.
    let is_canonical = digest.len() == SHA256_HEX_LEN
        && digest
            .bytes()
            .all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'));

    is_canonical.then(|| format!("sha256:{digest}"))
}
58+
3459
/// TaskIDParameter is the parameter of the task id.
3560
pub enum TaskIDParameter {
3661
/// Content uses the content to generate the task id.
@@ -44,6 +69,11 @@ pub enum TaskIDParameter {
4469
application: Option<String>,
4570
filtered_query_params: Vec<String>,
4671
},
72+
/// BlobDigestBased uses the blob digest to generate the task id.
73+
/// The digest can use other algorithms (like sha256, sha512, etc.),
74+
/// but the task ID in Dragonfly must be SHA256.
75+
/// Task ID needs to compute a SHA256 hash based on the digest content.
76+
BlobDigestBased(String),
4777
}
4878

4979
/// PersistentCacheTaskIDParameter is the parameter of the persistent cache task id.
@@ -153,6 +183,12 @@ impl IDGenerator {
153183
// Generate the task id.
154184
Ok(hex::encode(hasher.finalize()))
155185
}
186+
TaskIDParameter::BlobDigestBased(digest) => {
187+
// The digest can use other algorithms (sha256, sha512, etc.),
188+
// but the task ID in Dragonfly must be SHA256.
189+
// Compute SHA256 hash based on the digest content.
190+
Ok(hex::encode(Sha256::digest(digest.as_bytes())))
191+
}
156192
}
157193
}
158194

@@ -335,6 +371,14 @@ mod tests {
335371
TaskIDParameter::Content("This is a test file".to_string()),
336372
"e2d0fe1585a63ec6009c8016ff8dda8b17719a637405a4e23c0ff81339148249",
337373
),
374+
(
375+
IDGenerator::new("127.0.0.1".to_string(), "localhost".to_string(), false),
376+
TaskIDParameter::BlobDigestBased(
377+
"sha256:1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"
378+
.to_string(),
379+
),
380+
"719498689c2f5bd76140f3bd2b03bcbc3134890e72b4d5b788d8b41ec0cd0f93",
381+
),
338382
];
339383

340384
for (generator, parameter, expected_id) in test_cases {
@@ -343,6 +387,42 @@ mod tests {
343387
}
344388
}
345389

390+
#[test]
fn test_extract_blob_digest_from_url() {
    // Valid blob URLs: the digest is returned with its `sha256:` prefix, and any query
    // string or fragment after the digest is ignored.
    let test_cases = vec![
        (
            "/v2/library/nginx/blobs/sha256:1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef",
            Some("sha256:1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef".to_string()),
        ),
        (
            "/v2/myorg/myrepo/blobs/sha256:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890",
            Some("sha256:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890".to_string()),
        ),
        (
            "/v2/namespace/repo/blobs/sha256:0000000000000000000000000000000000000000000000000000000000000000?query=param",
            Some("sha256:0000000000000000000000000000000000000000000000000000000000000000".to_string()),
        ),
        (
            "/v2/namespace/repo/blobs/sha256:0000000000000000000000000000000000000000000000000000000000000000#fragment",
            Some("sha256:0000000000000000000000000000000000000000000000000000000000000000".to_string()),
        ),
    ];

    for (input, expected) in test_cases {
        assert_eq!(extract_blob_digest_from_url(input), expected);
    }

    // A digest of the right length made only of non-hex characters, so the hex check itself
    // is exercised rather than the length check.
    let not_hex = format!("/v2/library/nginx/blobs/sha256:{}", "z".repeat(64));
    // One character longer than a SHA256 hex digest.
    let too_long = format!("/v2/library/nginx/blobs/sha256:{}", "a".repeat(65));
    // A valid-looking digest followed by a further path segment.
    let trailing_segment = format!("/v2/library/nginx/blobs/sha256:{}/extra", "a".repeat(64));

    // Invalid URLs (should return None).
    let invalid_cases = vec![
        "/v2/library/nginx/manifests/latest",
        "/v2/library/nginx/blobs/sha256:short",
        "/v2/library/nginx/blobs/sha256:xyz", // Too short (and not hex)
        not_hex.as_str(),
        too_long.as_str(),
        trailing_segment.as_str(),
        "/api/v1/some/other/path",
        "",
    ];

    for input in invalid_cases {
        assert_eq!(extract_blob_digest_from_url(input), None);
    }
}
425+
346426
#[test]
347427
fn should_generate_persistent_cache_task_id() {
348428
let dir = tempdir().unwrap();

dragonfly-client/src/grpc/dfdaemon_download.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,33 @@ impl DfdaemonDownload for DfdaemonDownloadServerHandler {
263263
.task
264264
.id_generator
265265
.task_id(match download.content_for_calculating_task_id.clone() {
266-
Some(content) => TaskIDParameter::Content(content),
266+
Some(content) => {
267+
// Check if the content matches OCI digest format: algorithm:encoded
268+
// See: https://github.com/opencontainers/image-spec/blob/main/descriptor.md#digests
269+
// Format: algorithm can be [a-z0-9+._-]+, encoded can be [a-zA-Z0-9=_-]+
270+
// If it's a digest, use BlobDigestBased to ensure SHA256 hash is calculated
271+
// from the digest content, regardless of the digest algorithm used.
272+
let is_digest = content.split_once(':').is_some_and(|(alg, enc)| {
273+
// Validate algorithm: [a-z0-9+._-]+
274+
!alg.is_empty()
275+
&& alg.chars().all(|c| {
276+
c.is_ascii_lowercase()
277+
|| c.is_ascii_digit()
278+
|| matches!(c, '+' | '.' | '_' | '-')
279+
})
280+
// Validate encoded: [a-zA-Z0-9=_-]+ and minimum length
281+
&& enc.len() >= 32
282+
&& enc.chars().all(|c| {
283+
c.is_ascii_alphanumeric() || matches!(c, '=' | '_' | '-')
284+
})
285+
});
286+
287+
if is_digest {
288+
TaskIDParameter::BlobDigestBased(content)
289+
} else {
290+
TaskIDParameter::Content(content)
291+
}
292+
}
267293
None => TaskIDParameter::URLBased {
268294
url: download.url.clone(),
269295
piece_length: download.piece_length,

dragonfly-client/src/grpc/dfdaemon_upload.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,33 @@ impl DfdaemonUpload for DfdaemonUploadServerHandler {
260260
.task
261261
.id_generator
262262
.task_id(match download.content_for_calculating_task_id.clone() {
263-
Some(content) => TaskIDParameter::Content(content),
263+
Some(content) => {
264+
// Check if the content matches OCI digest format: algorithm:encoded
265+
// See: https://github.com/opencontainers/image-spec/blob/main/descriptor.md#digests
266+
// Format: algorithm can be [a-z0-9+._-]+, encoded can be [a-zA-Z0-9=_-]+
267+
// If it's a digest, use BlobDigestBased to ensure SHA256 hash is calculated
268+
// from the digest content, regardless of the digest algorithm used.
269+
let is_digest = content.split_once(':').is_some_and(|(alg, enc)| {
270+
// Validate algorithm: [a-z0-9+._-]+
271+
!alg.is_empty()
272+
&& alg.chars().all(|c| {
273+
c.is_ascii_lowercase()
274+
|| c.is_ascii_digit()
275+
|| matches!(c, '+' | '.' | '_' | '-')
276+
})
277+
// Validate encoded: [a-zA-Z0-9=_-]+ and minimum length
278+
&& enc.len() >= 32
279+
&& enc.chars().all(|c| {
280+
c.is_ascii_alphanumeric() || matches!(c, '=' | '_' | '-')
281+
})
282+
});
283+
284+
if is_digest {
285+
TaskIDParameter::BlobDigestBased(content)
286+
} else {
287+
TaskIDParameter::Content(content)
288+
}
289+
}
264290
None => TaskIDParameter::URLBased {
265291
url: download.url.clone(),
266292
piece_length: download.piece_length,

dragonfly-client/src/proxy/mod.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use dragonfly_client_metric::{
3131
};
3232
use dragonfly_client_util::{
3333
http::{hashmap_to_headermap, headermap_to_hashmap},
34+
id_generator::extract_blob_digest_from_url,
3435
shutdown,
3536
tls::{generate_self_signed_certs_by_ca_cert, generate_simple_self_signed_certs, NoVerifier},
3637
};
@@ -1121,6 +1122,25 @@ fn make_download_task_request(
11211122
}
11221123
}
11231124

1125+
// Determine the content for calculating task ID.
1126+
// Priority:
1127+
// 1. Explicit header value (X-Dragonfly-Content-For-Calculating-Task-ID)
1128+
// 2. Blob digest from URL (if enable_task_id_based_blob_digest is true)
1129+
// 3. None (will use URL-based calculation)
1130+
let content_for_calculating_task_id = header::get_content_for_calculating_task_id(&header)
1131+
.or_else(|| {
1132+
if config
1133+
.proxy
1134+
.registry_mirror
1135+
.enable_task_id_based_blob_digest
1136+
{
1137+
let path = request.uri().path();
1138+
extract_blob_digest_from_url(path)
1139+
} else {
1140+
None
1141+
}
1142+
});
1143+
11241144
Ok(DownloadTaskRequest {
11251145
download: Some(Download {
11261146
url: make_download_url(request.uri(), rule.use_tls, rule.redirect.clone())?,
@@ -1149,7 +1169,7 @@ fn make_download_task_request(
11491169
is_prefetch: false,
11501170
need_piece_content: false,
11511171
force_hard_link: header::get_force_hard_link(&header),
1152-
content_for_calculating_task_id: header::get_content_for_calculating_task_id(&header),
1172+
content_for_calculating_task_id,
11531173
remote_ip: Some(remote_ip.to_string()),
11541174
concurrent_piece_count: Some(config.download.concurrent_piece_count),
11551175
overwrite: false,

0 commit comments

Comments
 (0)