Skip to content

Commit

Permalink
PARQUET-2333: Support bzip2 and xz compression in the to-avro subcomm…
Browse files Browse the repository at this point in the history
…and (apache#1131)
  • Loading branch information
sekikn authored Aug 16, 2023
1 parent ef9929c commit f8465a2
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 13 deletions.
6 changes: 6 additions & 0 deletions parquet-cli/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@
<artifactId>avro</artifactId>
<version>${avro.version}</version>
</dependency>
<dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>${tukaani.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>com.github.luben</groupId>
<artifactId>zstd-jni</artifactId>
Expand Down
32 changes: 19 additions & 13 deletions parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.parquet.cli.util;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.file.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

Expand All @@ -34,19 +35,24 @@ public static CompressionCodecName parquetCodec(String codec) {
}

public static CodecFactory avroCodec(String codec) {
CompressionCodecName parquetCodec = parquetCodec(codec);
switch (parquetCodec) {
case UNCOMPRESSED:
return CodecFactory.nullCodec();
case SNAPPY:
return CodecFactory.snappyCodec();
case GZIP:
return CodecFactory.deflateCodec(9);
case ZSTD:
return CodecFactory.zstandardCodec(CodecFactory.DEFAULT_ZSTANDARD_LEVEL);
default:
throw new IllegalArgumentException(
"Codec incompatible with Avro: " + codec);
String avroCodec;
if (codec.equalsIgnoreCase(CompressionCodecName.GZIP.name())) {
avroCodec = "deflate";
} else if (codec.equalsIgnoreCase(CompressionCodecName.SNAPPY.name())) {
avroCodec = "snappy";
} else if (codec.equalsIgnoreCase(CompressionCodecName.UNCOMPRESSED.name())) {
avroCodec = "null";
} else if (codec.equalsIgnoreCase(CompressionCodecName.ZSTD.name())) {
avroCodec = "zstandard";
} else {
avroCodec = codec;
}
CodecFactory factory;
try {
factory = CodecFactory.fromString(avroCodec);
} catch (AvroRuntimeException e) {
throw new IllegalArgumentException("Codec incompatible with Avro: " + codec, e);
}
return factory;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,18 @@ public void testToAvroCommandWithZstdCompression() throws IOException {
Assert.assertTrue(avroFile.exists());
}

@Test
public void testToAvroCommandWithBzip2Compression() throws IOException {
File avroFile = toAvro(parquetFile(), "bzip2");
Assert.assertTrue(avroFile.exists());
}

@Test
public void testToAvroCommandWithXzCompression() throws IOException {
File avroFile = toAvro(parquetFile(), "xz");
Assert.assertTrue(avroFile.exists());
}

@Test(expected = IllegalArgumentException.class)
public void testToAvroCommandWithInvalidCompression() throws IOException {
toAvro(parquetFile(), "FOO");
Expand Down
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
<!-- parquet-cli dependencies -->
<opencsv.version>2.3</opencsv.version>
<jcommander.version>1.82</jcommander.version>
<tukaani.version>1.9</tukaani.version>
<zstd-jni.version>1.5.0-1</zstd-jni.version>
<commons-text.version>1.8</commons-text.version>
<jsr305.version>3.0.2</jsr305.version>
Expand Down

0 comments on commit f8465a2

Please sign in to comment.