
Commit ea3eecf
PARQUET-2171. Tests all working

Testing found a couple of bugs with the initial Vector IO PR:

* Fixed ParquetFileReader, and parameterized the tests which had failed, so as to ensure coverage.
* TestVectorIOBridge now uses the checksum filesystem, which does implement vector IO (which implies all file:// paths get the speedup).
* Improved the assertions in the failing tests, which were all reporting mismatches between expected and actual data read.

As well as these changes, I've also run the entire suite locally with a core-site.xml set to turn on vector IO; this is how the regressions were identified.
1 parent 78a3307 commit ea3eecf
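For context, the suite-wide run mentioned in the commit message would rely on a core-site.xml override along these lines. This is a minimal sketch: the property key is an assumption inferred from the ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED constant used in the diffs below, not something this commit confirms.

```xml
<!-- core-site.xml sketch: turn on Parquet vectored IO for every test JVM.
     The key is assumed from ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED. -->
<configuration>
  <property>
    <name>parquet.hadoop.vectored.io.enabled</name>
    <value>true</value>
  </property>
</configuration>
```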

File tree

5 files changed: +60 −18 lines


parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java

Lines changed: 2 additions & 3 deletions

```diff
@@ -993,9 +993,6 @@ private ColumnChunkPageReadStore internalReadRowGroup(int blockIndex) throws IOE
     }
     // actually read all the chunks
     ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount());
-    for (ConsecutivePartList consecutiveChunks : allParts) {
-      consecutiveChunks.readAll(f, builder);
-    }
     readAllPartsVectoredOrNormal(allParts, builder);
     for (Chunk chunk : builder.build()) {
       readChunkPages(chunk, block, rowGroup);
@@ -1953,6 +1950,8 @@ public void readFromVectoredRange(ParquetFileRange currRange,
       LOG.debug("Waiting for vectored read to finish for range {} ", currRange);
       buffer = BindingUtils.awaitFuture(currRange.getDataReadFuture(),
           HADOOP_VECTORED_READ_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+      // report in a counter the data we just scanned
+      BenchmarkCounter.incrementBytesRead(currRange.getLength());
     } catch (TimeoutException e) {
       String error = String.format("Timeout while fetching result for %s", currRange);
       LOG.error(error, e);
```
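The deleted loop always read each ConsecutivePartList sequentially; the new readAllPartsVectoredOrNormal call centralizes the choice of read path. A plausible sketch of that dispatch is below; the helper names (options.useHadoopVectoredIO, f.readVectoredAvailable, readVectored) are assumptions for illustration, not code shown in this commit.

```java
// Hypothetical sketch of the dispatch behind readAllPartsVectoredOrNormal;
// only the call site and signature are visible in this diff.
private void readAllPartsVectoredOrNormal(List<ConsecutivePartList> allParts,
    ChunkListBuilder builder) throws IOException {
  if (options.useHadoopVectoredIO() && f.readVectoredAvailable()) {
    // hand every range to the filesystem in one vectored request
    readVectored(allParts, builder);
  } else {
    // classic path: read each consecutive part sequentially
    for (ConsecutivePartList consecutiveChunks : allParts) {
      consecutiveChunks.readAll(f, builder);
    }
  }
}
```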

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInputFormatColumnProjection.java

Lines changed: 22 additions & 1 deletion

```diff
@@ -42,15 +42,21 @@
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
 import java.util.UUID;
 
 import static java.lang.Thread.sleep;
 import static org.apache.parquet.schema.OriginalType.UTF8;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
 
+@RunWith(Parameterized.class)
 public class TestInputFormatColumnProjection {
   public static final String FILE_CONTENT = "" +
       "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ," +
@@ -96,7 +102,19 @@ protected void map(Void key, Group value, Context context)
 
   @Rule
   public TemporaryFolder temp = new TemporaryFolder();
+  @Parameterized.Parameters(name = "vectored : {0}")
+  public static List<Boolean> params() {
+    return Arrays.asList(true, false);
+  }
 
+  /**
+   * Read type: true for vectored IO.
+   */
+  private final boolean readType;
+
+  public TestInputFormatColumnProjection(boolean readType) {
+    this.readType = readType;
+  }
   @Test
   public void testProjectionSize() throws Exception {
     Assume.assumeTrue( // only run this test for Hadoop 2
@@ -115,6 +133,8 @@ public void testProjectionSize() throws Exception {
     outputFolder.delete();
 
     Configuration conf = new Configuration();
+    // set the vector IO option
+    conf.setBoolean(ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED, readType);
     // set the projection schema
     conf.set("parquet.read.schema", Types.buildMessage()
         .required(BINARY).as(UTF8).named("char")
@@ -164,7 +184,8 @@ public void testProjectionSize() throws Exception {
       bytesRead = Reader.bytesReadCounter.getValue();
     }
 
-    Assert.assertTrue("Should read less than 10% of the input file size",
+    Assert.assertTrue("Should read (" + bytesRead + " bytes)"
+        + " less than 10% of the input file size (" + bytesWritten + ")",
         bytesRead < (bytesWritten / 10));
   }
 
```
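For readers unfamiliar with JUnit 4 parameterization, the pattern added above makes the runner instantiate the test class once per value returned by params(), so every @Test runs with vectored IO both on and off. A condensed, self-contained sketch of the same pattern (the class and test names here are illustrative):

```java
import static org.junit.Assert.assertNotNull;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

// Illustrative stand-alone version of the pattern used in these tests.
@RunWith(Parameterized.class)
public class VectoredToggleExample {

  @Parameterized.Parameters(name = "vectored : {0}")
  public static List<Boolean> params() {
    return Arrays.asList(true, false);
  }

  private final boolean vectored;

  public VectoredToggleExample(boolean vectored) {
    this.vectored = vectored;
  }

  @Test
  public void runsOncePerParameter() {
    // executes twice, as "vectored : true" and "vectored : false"
    assertNotNull(Boolean.valueOf(vectored));
  }
}
```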

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java

Lines changed: 10 additions & 8 deletions

```diff
@@ -125,25 +125,27 @@ public class TestParquetFileWriter {
 
   private String writeSchema;
 
-  private final String readType;
-
   @Rule
   public final TemporaryFolder temp = new TemporaryFolder();
 
-  @Parameterized.Parameters(name = "Read type : {0}")
-  public static List<String> params() {
-    return Arrays.asList("vectored", "normal");
+  @Parameterized.Parameters(name = "vectored : {0}")
+  public static List<Boolean> params() {
+    return Arrays.asList(true, false);
   }
 
+  /**
+   * Read type: true for vectored IO.
+   */
+  private final boolean readType;
 
-  public TestParquetFileWriter(String readType) {
+  public TestParquetFileWriter(boolean readType) {
     this.readType = readType;
   }
 
   private Configuration getTestConfiguration() {
     Configuration conf = new Configuration();
-    conf.set(ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED,
-        String.valueOf(readType.equals("vectored")));
+    // set the vector IO option
+    conf.setBoolean(ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED, readType);
     return conf;
   }
 
```

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/example/TestInputOutputFormat.java

Lines changed: 23 additions & 2 deletions

```diff
@@ -30,6 +30,7 @@
 import java.lang.reflect.Method;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -63,9 +64,13 @@
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.util.ContextUtil;
 import org.apache.parquet.schema.MessageTypeParser;
+
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+@RunWith(Parameterized.class)
 public class TestInputOutputFormat {
   private static final Logger LOG = LoggerFactory.getLogger(TestInputOutputFormat.class);
 
@@ -82,9 +87,24 @@ public class TestInputOutputFormat {
   private Class<? extends Mapper<?,?,?,?>> readMapperClass;
   private Class<? extends Mapper<?,?,?,?>> writeMapperClass;
 
+  @Parameterized.Parameters(name = "vectored : {0}")
+  public static List<Boolean> params() {
+    return Arrays.asList(true, false);
+  }
+
+  /**
+   * Read type: true for vectored IO.
+   */
+  private final boolean readType;
+
+  public TestInputOutputFormat(boolean readType) {
+    this.readType = readType;
+  }
   @Before
   public void setUp() {
     conf = new Configuration();
+    // set the vector IO option
+    conf.setBoolean(ParquetInputFormat.HADOOP_VECTORED_IO_ENABLED, readType);
     writeSchema = "message example {\n" +
         "required int32 line;\n" +
         "required binary content;\n" +
@@ -335,8 +355,9 @@ public void testReadWriteWithCounter() throws Exception {
 
     assertTrue(value(readJob, "parquet", "bytesread") > 0L);
     assertTrue(value(readJob, "parquet", "bytestotal") > 0L);
-    assertTrue(value(readJob, "parquet", "bytesread")
-        == value(readJob, "parquet", "bytestotal"));
+    assertEquals("bytestotal != bytesread",
+        value(readJob, "parquet", "bytestotal"),
+        value(readJob, "parquet", "bytesread"));
     //not testing the time read counter since it could be zero due to the size of data is too small
   }
 
```

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/util/vectorio/TestVectorIOBridge.java

Lines changed: 3 additions & 4 deletions

```diff
@@ -38,7 +38,6 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RawLocalFileSystem;
 import org.apache.hadoop.io.ByteBufferPool;
 import org.apache.hadoop.io.ElasticByteBufferPool;
 import org.apache.parquet.bytes.HeapByteBufferAllocator;
@@ -83,7 +82,7 @@ public class TestVectorIOBridge {
     return pool.getBuffer(false, value);
   };
 
-  private RawLocalFileSystem fileSystem;
+  private FileSystem fileSystem;
   private Path testFilePath;
 
   public TestVectorIOBridge() {
@@ -95,7 +94,7 @@ public void setUp() throws IOException {
     // skip the tests if the FileRangeBridge goes not load.
     assumeTrue("Bridge not available", FileRangeBridge.bridgeAvailable());
 
-    fileSystem = (RawLocalFileSystem) FileSystem.getLocal(new Configuration()).getRaw();
+    fileSystem = FileSystem.getLocal(new Configuration());
     testFilePath = fileSystem.makeQualified(vectoredPath);
     createFile(fileSystem, testFilePath, DATASET);
   }
@@ -107,7 +106,7 @@ public void tearDown() throws IOException {
     }
   }
 
-  public RawLocalFileSystem getFileSystem() {
+  public FileSystem getFileSystem() {
     return fileSystem;
  }
```
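The switch from RawLocalFileSystem to FileSystem.getLocal() matters because the local filesystem is a ChecksumFileSystem, which implements Hadoop's vectored read API, while the raw stream bypasses that support. A minimal sketch of the underlying API the bridge targets (requires Hadoop 3.3.5+; the path and ranges below are illustrative):

```java
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileRange;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class VectoredReadSketch {
  public static void main(String[] args) throws Exception {
    // getLocal() returns LocalFileSystem, a ChecksumFileSystem with
    // vectored-read support; the file below is an illustrative path.
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path path = new Path("/tmp/vectored-demo.bin");

    List<FileRange> ranges = Arrays.asList(
        FileRange.createFileRange(0, 100),      // first 100 bytes
        FileRange.createFileRange(4096, 512));  // a later, discontiguous range

    try (FSDataInputStream in = fs.open(path)) {
      // one call schedules all ranges; each range completes via its future
      in.readVectored(ranges, ByteBuffer::allocate);
      for (FileRange range : ranges) {
        ByteBuffer data = range.getData().get();  // await completion
        System.out.println("read " + data.remaining() + " bytes @ " + range.getOffset());
      }
    }
  }
}
```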
