Skip to content

Commit e76f134

Browse files
committed
Fix NPE in RestoreService when shard size is unavailable
Signed-off-by: Sotaro Hikita <[email protected]>
1 parent c7503fc commit e76f134

File tree

3 files changed

+80
-6
lines changed

3 files changed

+80
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3535
- Fix issue with updating core with a patch number other than 0 ([#19377](https://github.com/opensearch-project/OpenSearch/pull/19377))
3636
- [Java Agent] Allow JRT protocol URLs in protection domain extraction ([#19683](https://github.com/opensearch-project/OpenSearch/pull/19683))
3737
- Fix potential concurrent modification exception when updating allocation filters ([#19701])(https://github.com/opensearch-project/OpenSearch/pull/19701))
38+
- Fix NullPointerException when restoring remote snapshot with missing shard size information ([#19684](https://github.com/opensearch-project/OpenSearch/pull/19684))
3839

3940
### Dependencies
4041
- Update to Gradle 9.1 ([#19575](https://github.com/opensearch-project/OpenSearch/pull/19575))

server/src/internalClusterTest/java/org/opensearch/snapshots/SearchableSnapshotIT.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,4 +1054,61 @@ private void assertSearchableSnapshotIndexDirectoryExistence(String nodeName, In
10541054
assertTrue("index cache path should " + (exists ? "exist" : "not exist"), Files.exists(indexPath) == exists);
10551055
}, 30, TimeUnit.SECONDS);
10561056
}
1057+
1058+
public void testRestoreRemoteSnapshotWithNullShardSizes() throws Exception {
1059+
final String snapshotName1 = "test-snap-1";
1060+
final String snapshotName2 = "test-snap-2";
1061+
final String repoName = "test-repo";
1062+
final String indexName1 = "test-idx-1";
1063+
final String indexName2 = "test-idx-2";
1064+
final Client client = client();
1065+
1066+
// Setup cluster with delayed ClusterInfo updates to simulate null shard sizes
1067+
client.admin()
1068+
.cluster()
1069+
.prepareUpdateSettings()
1070+
.setPersistentSettings(Settings.builder().put("cluster.info.update.interval", "60m"))
1071+
.get();
1072+
1073+
internalCluster().ensureAtLeastNumDataNodes(2);
1074+
1075+
createIndexWithDocsAndEnsureGreen(0, 50, indexName1);
1076+
createRepositoryWithSettings(null, repoName);
1077+
takeSnapshot(client, snapshotName1, repoName, indexName1);
1078+
deleteIndicesAndEnsureGreen(client, indexName1);
1079+
1080+
createIndexWithDocsAndEnsureGreen(0, 50, indexName2);
1081+
takeSnapshot(client, snapshotName2, repoName, indexName2);
1082+
deleteIndicesAndEnsureGreen(client, indexName2);
1083+
1084+
internalCluster().ensureAtLeastNumWarmNodes(2);
1085+
1086+
RestoreSnapshotRequest firstRestore = new RestoreSnapshotRequest(repoName, snapshotName1).indices(indexName1)
1087+
.storageType(RestoreSnapshotRequest.StorageType.REMOTE_SNAPSHOT)
1088+
.renamePattern("(.+)")
1089+
.renameReplacement("remote-$1")
1090+
.waitForCompletion(true);
1091+
1092+
client.admin().cluster().restoreSnapshot(firstRestore).get();
1093+
ensureGreen("remote-" + indexName1);
1094+
1095+
// Second restore immediately after - ClusterInfo won't have size data for first restore shards
1096+
RestoreSnapshotRequest secondRestore = new RestoreSnapshotRequest(repoName, snapshotName2).indices(indexName2)
1097+
.storageType(RestoreSnapshotRequest.StorageType.REMOTE_SNAPSHOT)
1098+
.renamePattern("(.+)")
1099+
.renameReplacement("remote-$1")
1100+
.waitForCompletion(true);
1101+
1102+
client.admin().cluster().restoreSnapshot(secondRestore).get();
1103+
ensureGreen("remote-" + indexName2);
1104+
1105+
assertDocCount("remote-" + indexName1, 50L);
1106+
assertDocCount("remote-" + indexName2, 50L);
1107+
1108+
client.admin()
1109+
.cluster()
1110+
.prepareUpdateSettings()
1111+
.setPersistentSettings(Settings.builder().putNull("cluster.info.update.interval"))
1112+
.get();
1113+
}
10571114
}

server/src/main/java/org/opensearch/snapshots/RestoreService.java

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -909,13 +909,29 @@ private void validateSearchableSnapshotRestorable(long totalRestorableRemoteInde
909909
.routingTable()
910910
.allShardsSatisfyingPredicate(isRemoteSnapshotShard);
911911

912-
long totalRestoredRemoteIndexesSize = shardsIterator.getShardRoutings()
913-
.stream()
914-
.map(clusterInfo::getShardSize)
915-
.mapToLong(Long::longValue)
916-
.sum();
912+
long totalRestoredRemoteIndicesSize = 0;
913+
int missingSizeCount = 0;
914+
List<ShardRouting> routings = shardsIterator.getShardRoutings();
915+
916+
for (ShardRouting shardRouting : routings) {
917+
Long shardSize = clusterInfo.getShardSize(shardRouting);
918+
if (shardSize != null) {
919+
totalRestoredRemoteIndicesSize += shardSize;
920+
} else {
921+
missingSizeCount++;
922+
}
923+
}
924+
925+
if (missingSizeCount > 0) {
926+
logger.warn(
927+
"Size information unavailable for {} out of {} remote snapshot shards. "
928+
+ "File cache validation will use available data only.",
929+
missingSizeCount,
930+
routings.size()
931+
);
932+
}
917933

918-
if (totalRestoredRemoteIndexesSize + totalRestorableRemoteIndexesSize > remoteDataToFileCacheRatio
934+
if (totalRestoredRemoteIndicesSize + totalRestorableRemoteIndexesSize > remoteDataToFileCacheRatio
919935
* totalNodeFileCacheSize) {
920936
throw new SnapshotRestoreException(
921937
snapshot,

0 commit comments

Comments
 (0)