HIVE-27714: Iceberg: metadata location overrides can cause data breach - handling default locations. (#4880). (Ayush Saxena, reviewed by Denys Kuzmenko)
ayushtkn authored Dec 1, 2023
1 parent 70f34e2 commit 33903b8
Showing 6 changed files with 146 additions and 22 deletions.
4 changes: 4 additions & 0 deletions common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2226,6 +2226,10 @@ public static enum ConfVars {
HIVE_ICEBERG_EXPIRE_SNAPSHOT_NUMTHREADS("hive.iceberg.expire.snapshot.numthreads", 4,
"The number of threads to be used for deleting files during expire snapshot. If set to 0 or below it uses the" +
" default DirectExecutorService"),

HIVE_ICEBERG_MASK_DEFAULT_LOCATION("hive.iceberg.mask.default.location", false,
"If this is set to true, the URI for auth will have the default table location masked with TABLE_DEFAULT_LOCATION"),

HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
"If this is set the header for RCFiles will simply be RCF. If this is not\n" +
"set the header will be that borrowed from sequence files, e.g. SEQ- followed\n" +
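The new flag defaults to false, so authorization URIs are unchanged unless a deployment opts in. A minimal sketch of reading and toggling it programmatically; the ConfVars entry is the one added above, while the wrapper class and printed text are hypothetical:

import org.apache.hadoop.hive.conf.HiveConf;

public class MaskFlagSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // Off by default: auth URIs carry the real default table location.
    boolean masked = conf.getBoolVar(HiveConf.ConfVars.HIVE_ICEBERG_MASK_DEFAULT_LOCATION);
    System.out.println("masking enabled? " + masked);
    // Opt in: getURIForAuth then substitutes the TABLE_DEFAULT_LOCATION marker
    // for the table's default location.
    conf.setBoolVar(HiveConf.ConfVars.HIVE_ICEBERG_MASK_DEFAULT_LOCATION, true);
  }
}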
org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java

@@ -48,6 +48,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.SnapshotContext;
@@ -214,6 +215,8 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
public static final String MERGE_ON_READ = "merge-on-read";
public static final String STATS = "/stats/snap-";

public static final String TABLE_DEFAULT_LOCATION = "TABLE_DEFAULT_LOCATION";

/**
* Function template for producing a custom sort expression function:
Takes the source column index and the bucket count to create a function where Iceberg bucket UDF is used to build
@@ -520,11 +523,11 @@ private boolean writeColStats(ColumnStatistics tableColStats, Table tbl, String
writer.finish();
return true;
} catch (IOException e) {
LOG.warn("Unable to write stats to puffin file", e.getMessage());
LOG.warn("Unable to write stats to puffin file {}", e.getMessage());
return false;
}
} catch (InvalidObjectException | IOException e) {
LOG.warn("Unable to invalidate or merge stats: ", e.getMessage());
LOG.warn("Unable to invalidate or merge stats: {}", e.getMessage());
return false;
}
}
@@ -1053,19 +1056,19 @@ public URI getURIForAuth(org.apache.hadoop.hive.metastore.api.Table hmsTable) th
Optional<String> metadataLocation =
SessionStateUtil.getProperty(conf, BaseMetastoreTableOperations.METADATA_LOCATION_PROP);
if (metadataLocation.isPresent()) {
authURI.append(encodeString(metadataLocation.get()));
authURI.append(getPathForAuth(metadataLocation.get()));
} else {
Optional<String> locationProperty =
SessionStateUtil.getProperty(conf, hive_metastoreConstants.META_TABLE_LOCATION);
if (locationProperty.isPresent()) {
// this property is set during the create operation before the hive table was created
// we are returning a dummy iceberg metadata file
authURI.append(encodeString(URI.create(locationProperty.get()).getPath()))
authURI.append(getPathForAuth(locationProperty.get()))
.append(encodeString("/metadata/dummy.metadata.json"));
} else {
Table table = IcebergTableUtil.getTable(conf, hmsTable);
authURI.append(
encodeString(URI.create(((BaseTable) table).operations().current().metadataFileLocation()).getPath()));
authURI.append(getPathForAuth(((BaseTable) table).operations().current().metadataFileLocation(),
hmsTable.getSd().getLocation()));
}
}
LOG.debug("Iceberg storage handler authorization URI {}", authURI);
@@ -1080,6 +1083,39 @@ static String encodeString(String rawString) {
return HiveConf.EncoderDecoderFactory.URL_ENCODER_DECODER.encode(rawString);
}

String getPathForAuth(String locationProperty) {
return getPathForAuth(locationProperty,
SessionStateUtil.getProperty(conf, hive_metastoreConstants.DEFAULT_TABLE_LOCATION).orElse(null));
}

String getPathForAuth(String locationProperty, String defaultTableLocation) {
boolean maskDefaultLocation = conf.getBoolean(HiveConf.ConfVars.HIVE_ICEBERG_MASK_DEFAULT_LOCATION.varname,
HiveConf.ConfVars.HIVE_ICEBERG_MASK_DEFAULT_LOCATION.defaultBoolVal);
String location = URI.create(locationProperty).getPath();
if (!maskDefaultLocation || defaultTableLocation == null ||
!arePathsInSameFs(locationProperty, defaultTableLocation)) {
return encodeString(location);
}
try {
Path locationPath = new Path(location);
Path defaultLocationPath = locationPath.toUri().getScheme() != null ?
FileUtils.makeQualified(new Path(defaultTableLocation), conf) :
Path.getPathWithoutSchemeAndAuthority(new Path(defaultTableLocation));
return encodeString(location.replaceFirst(defaultLocationPath.toString(), TABLE_DEFAULT_LOCATION));
} catch (IOException e) {
throw new RuntimeException(e);
}
}

private boolean arePathsInSameFs(String locationProperty, String defaultTableLocation) {
try {
return FileUtils.equalsFileSystem(new Path(locationProperty).getFileSystem(conf),
new Path(defaultTableLocation).getFileSystem(conf));
} catch (IOException e) {
LOG.debug("Unable to get FileSystem for path {} and {}", locationProperty, defaultTableLocation);
return false;
}
}

@Override
public void validateSinkDesc(FileSinkDesc sinkDesc) throws SemanticException {
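To make the masking concrete, a small illustration with made-up paths (not part of the commit) of what getPathForAuth returns, shown before URL encoding:

// Hypothetical inputs on the same file system:
String metadataFile = "hdfs://nn:8020/warehouse/external/default.db/target/metadata/v1.metadata.json";
String defaultLocation = "hdfs://nn:8020/warehouse/external/default.db/target";
// With hive.iceberg.mask.default.location=true, getPathForAuth(metadataFile, defaultLocation)
// strips the scheme/authority and replaces the default-location prefix:
//   TABLE_DEFAULT_LOCATION/metadata/v1.metadata.json
// With the flag off (the default) the real path is kept:
//   /warehouse/external/default.db/target/metadata/v1.metadata.json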
@@ -95,6 +95,7 @@
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;

import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ICEBERG_MASK_DEFAULT_LOCATION;
import static org.apache.iceberg.TableProperties.GC_ENABLED;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.junit.runners.Parameterized.Parameter;
@@ -1484,31 +1485,59 @@ public void testCommandsWithPartitionClauseThrow() {
}

@Test
public void testAuthzURI() throws TException, InterruptedException, URISyntaxException {
public void testAuthzURIMasked() throws TException, URISyntaxException, InterruptedException {
testAuthzURI(true);
}

@Test
public void testAuthzURIUnmasked() throws TException, URISyntaxException, InterruptedException {
testAuthzURI(false);
}

public void testAuthzURI(boolean masked) throws TException, InterruptedException, URISyntaxException {
TableIdentifier target = TableIdentifier.of("default", "target");
Table table = testTables.createTable(shell, target.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
PartitionSpec.unpartitioned(), FileFormat.PARQUET, ImmutableList.of());
org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable(target);

HiveIcebergStorageHandler storageHandler = new HiveIcebergStorageHandler();
shell.getHiveConf().setBoolean(HIVE_ICEBERG_MASK_DEFAULT_LOCATION.varname, masked);
storageHandler.setConf(shell.getHiveConf());
URI uriForAuth = storageHandler.getURIForAuth(hmsTable);

String metadataLocation =
storageHandler.getPathForAuth(((BaseTable) table).operations().current().metadataFileLocation(),
hmsTable.getSd().getLocation());

if (masked) {
Assert.assertTrue(metadataLocation.startsWith(HiveIcebergStorageHandler.TABLE_DEFAULT_LOCATION));
}

Assert.assertEquals("iceberg://" +
HiveIcebergStorageHandler.encodeString(target.namespace().toString()) + "/" +
HiveIcebergStorageHandler.encodeString(target.name()) + "?snapshot=" +
HiveIcebergStorageHandler.encodeString(
URI.create(((BaseTable) table).operations().current().metadataFileLocation()).getPath()),
URI.create(metadataLocation).getPath()),
uriForAuth.toString());

Assert.assertEquals("iceberg://" + target.namespace() + "/" + target.name() + "?snapshot=" +
URI.create(((BaseTable) table).operations().current().metadataFileLocation()).getPath(),
URI.create(metadataLocation).getPath(),
HiveConf.EncoderDecoderFactory.URL_ENCODER_DECODER.decode(uriForAuth.toString()));

}

@Test
public void testAuthzURIWithAuthEnabledWithMetadataLocation() throws HiveException {
public void testAuthzURIWithAuthEnabledWithMetadataLocationMasked() throws HiveException {
testAuthzURIWithAuthEnabledWithMetadataLocation(true);
}

@Test
public void testAuthzURIWithAuthEnabledWithMetadataLocationUnmasked() throws HiveException {
testAuthzURIWithAuthEnabledWithMetadataLocation(false);
}

public void testAuthzURIWithAuthEnabledWithMetadataLocation(boolean masked) throws HiveException {
shell.getHiveConf().setBoolean(HIVE_ICEBERG_MASK_DEFAULT_LOCATION.varname, masked);
shell.setHiveSessionValue("hive.security.authorization.enabled", true);
shell.setHiveSessionValue("hive.security.authorization.manager",
"org.apache.iceberg.mr.hive.CustomTestHiveAuthorizerFactory");
@@ -1540,7 +1569,21 @@ public void testAuthzURIWithAuthEnabledWithMetadataLocation() throws HiveExcepti
}

@Test
public void testAuthzURIWithAuthEnabledAndMockCommandAuthorizer() throws HiveException {
public void testAuthzURIWithAuthEnabledAndMockCommandAuthorizerMasked()
throws HiveException, TException, InterruptedException {
Assume.assumeTrue(testTableType.equals(TestTables.TestTableType.HIVE_CATALOG));
testAuthzURIWithAuthEnabledAndMockCommandAuthorizer(true);
}

@Test
public void testAuthzURIWithAuthEnabledAndMockCommandAuthorizerUnmasked()
throws HiveException, TException, InterruptedException {
testAuthzURIWithAuthEnabledAndMockCommandAuthorizer(false);
}

public void testAuthzURIWithAuthEnabledAndMockCommandAuthorizer(boolean masked)
throws HiveException, TException, InterruptedException {
shell.getHiveConf().setBoolean(HIVE_ICEBERG_MASK_DEFAULT_LOCATION.varname, masked);
shell.setHiveSessionValue("hive.security.authorization.enabled", true);
shell.setHiveSessionValue("hive.security.authorization.manager",
"org.apache.iceberg.mr.hive.CustomTestHiveAuthorizerFactory");
@@ -1554,8 +1597,18 @@ public void testAuthzURIWithAuthEnabledAndMockCommandAuthorizer() throws HiveExc
Optional<HivePrivilegeObject> hivePrivObject = outputHObjsCaptor.getValue().stream()
.filter(hpo -> hpo.getType().equals(HivePrivilegeObject.HivePrivilegeObjectType.STORAGEHANDLER_URI)).findAny();
if (hivePrivObject.isPresent()) {
org.apache.hadoop.hive.metastore.api.Table hmsTable = shell.metastore().getTable(target);
HiveIcebergStorageHandler storageHandler = new HiveIcebergStorageHandler();
storageHandler.setConf(shell.getHiveConf());
String metadataLocation = HiveConf.EncoderDecoderFactory.URL_ENCODER_DECODER.decode(
storageHandler.getPathForAuth(((BaseTable) table).operations().current().metadataFileLocation(),
hmsTable.getSd().getLocation()));

if (masked) {
Assert.assertTrue(metadataLocation.startsWith(HiveIcebergStorageHandler.TABLE_DEFAULT_LOCATION));
}
Assert.assertEquals("iceberg://" + target.namespace() + "/" + target.name() + "?snapshot=" +
new Path(((BaseTable) table).operations().current().metadataFileLocation()).getParent().toUri()
new Path(metadataLocation).getParent().toUri()
.getPath() +
"/dummy.metadata.json",
HiveConf.EncoderDecoderFactory.URL_ENCODER_DECODER.decode(hivePrivObject.get().getObjectName()));
@@ -1565,7 +1618,18 @@ public void testAuthzURIWithAuthEnabled() throws TException, InterruptedExceptio
}

@Test
public void testAuthzURIWithAuthEnabled() throws TException, InterruptedException, URISyntaxException {
public void testAuthzURIWithAuthEnabledMasked() throws TException, URISyntaxException, InterruptedException {
Assume.assumeTrue(testTableType.equals(TestTables.TestTableType.HIVE_CATALOG));
testAuthzURIWithAuthEnabled(true);
}

@Test
public void testAuthzURIWithAuthEnabledUnmasked() throws TException, URISyntaxException, InterruptedException {
testAuthzURIWithAuthEnabled(false);
}

public void testAuthzURIWithAuthEnabled(boolean masked) throws TException, InterruptedException, URISyntaxException {
shell.getHiveConf().setBoolean(HIVE_ICEBERG_MASK_DEFAULT_LOCATION.varname, masked);
shell.setHiveSessionValue("hive.security.authorization.enabled", true);
TableIdentifier target = TableIdentifier.of("default", "target");
Table table = testTables.createTable(shell, target.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
@@ -1575,9 +1639,15 @@ public void testAuthzURIWithAuthEnabled() throws TException, InterruptedExceptio
HiveIcebergStorageHandler storageHandler = new HiveIcebergStorageHandler();
storageHandler.setConf(shell.getHiveConf());
URI uriForAuth = storageHandler.getURIForAuth(hmsTable);
String metadataLocation =
storageHandler.getPathForAuth(((BaseTable) table).operations().current().metadataFileLocation(),
hmsTable.getSd().getLocation());

if (masked) {
Assert.assertTrue(metadataLocation.startsWith(HiveIcebergStorageHandler.TABLE_DEFAULT_LOCATION));
}
Assert.assertEquals("iceberg://" + target.namespace() + "/" + target.name() + "?snapshot=" +
URI.create(((BaseTable) table).operations().current()
.metadataFileLocation()).getPath(),
URI.create(metadataLocation).getPath(),
HiveConf.EncoderDecoderFactory.URL_ENCODER_DECODER.decode(uriForAuth.toString()));
}

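For reference, the decoded authorization URIs that the assertions above compare against take roughly this shape, with illustrative values for a table default.target:

// Unmasked (hive.iceberg.mask.default.location=false):
//   iceberg://default/target?snapshot=/warehouse/external/default.db/target/metadata/00000-abc.metadata.json
// Masked (hive.iceberg.mask.default.location=true):
//   iceberg://default/target?snapshot=TABLE_DEFAULT_LOCATION/metadata/00000-abc.metadata.json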
@@ -24,6 +24,7 @@
import java.net.URISyntaxException;
import java.util.Collections;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.hive.common.TableName;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.common.type.SnapshotContext;
25 changes: 18 additions & 7 deletions ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -25,6 +25,7 @@
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVEARCHIVEENABLED;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_DEFAULT_STORAGE_HANDLER;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.DEFAULT_TABLE_LOCATION;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_LOCATION;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE;
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_CTAS;
@@ -13800,6 +13801,11 @@ private Map<String, String> validateAndAddDefaultProperties(
addDbAndTabToOutputs(qualifiedTabName,
TableType.EXTERNAL_TABLE, isTemporaryTable, retValue, storageFormat);
}

if (isIcebergTable(retValue)) {
SessionStateUtil.addResourceOrThrow(conf, hive_metastoreConstants.DEFAULT_TABLE_LOCATION,
getDefaultLocation(qualifiedTabName[0], qualifiedTabName[1], true));
}
return retValue;
}

@@ -14211,13 +14217,7 @@ ASTNode analyzeCreateTable(
if (location != null) {
tblLocation = location;
} else {
try {
Warehouse wh = new Warehouse(conf);
tblLocation = wh.getDefaultTablePath(db.getDatabase(qualifiedTabName.getDb()), qualifiedTabName.getTable(),
isExt).toUri().getPath();
} catch (MetaException | HiveException e) {
throw new SemanticException(e);
}
tblLocation = getDefaultLocation(qualifiedTabName.getDb(), qualifiedTabName.getTable(), isExt);
}
try {
HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(conf, storageFormat.getStorageHandler());
@@ -14376,6 +14376,17 @@ ASTNode analyzeCreateTable(
return null;
}

private String getDefaultLocation(String dbName, String tableName, boolean isExt) throws SemanticException {
String tblLocation;
try {
Warehouse wh = new Warehouse(conf);
tblLocation = wh.getDefaultTablePath(db.getDatabase(dbName), tableName, isExt).toUri().getPath();
} catch (MetaException | HiveException e) {
throw new SemanticException(e);
}
return tblLocation;
}

private static boolean isIcebergTable(Map<String, String> tblProps) {
return AlterTableConvertOperation.ConversionFormats.ICEBERG.properties().get(META_TABLE_STORAGE)
.equalsIgnoreCase(tblProps.get(META_TABLE_STORAGE));
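The default location computed here is handed to the Iceberg storage handler through the session state. A sketch of that round trip, grounded in the calls shown above (the path value is hypothetical):

// In SemanticAnalyzer, while analyzing CREATE TABLE for an Iceberg table:
SessionStateUtil.addResourceOrThrow(conf, hive_metastoreConstants.DEFAULT_TABLE_LOCATION,
    "/warehouse/external/default.db/target");  // hypothetical default path
// Later, HiveIcebergStorageHandler.getPathForAuth(String) reads it back:
String defaultLoc = SessionStateUtil.getProperty(conf, hive_metastoreConstants.DEFAULT_TABLE_LOCATION)
    .orElse(null);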

Some generated files are not rendered by default.
