|
20 | 20 |
|
21 | 21 | import static org.apache.parquet.column.ParquetProperties.DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED;
|
22 | 22 | import static org.apache.parquet.column.ParquetProperties.DEFAULT_BLOOM_FILTER_ENABLED;
|
| 23 | +import static org.apache.parquet.column.ParquetProperties.DEFAULT_PAGE_PATH_OUTPUT_COMMITTER_ENABLED; |
23 | 24 | import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
|
24 | 25 | import static org.apache.parquet.hadoop.util.ContextUtil.getConfiguration;
|
25 | 26 |
|
|
98 | 99 | * </pre>
|
99 | 100 | * <p>
|
100 | 101 | * if none of those is set the data is uncompressed.
|
101 |
| - * |
| 102 | + * <p> |
| 103 | + * This class also generates the committer required to manifest the work in the |
| 104 | + * destination directory if and when the job is committed. |
| 105 | + * This has historically always created an instance of {@link ParquetOutputCommitter}. |
| 106 | + * If {@link #PAGE_PATH_OUTPUT_COMMITTER_ENABLED} is true, the superclass is used |
| 107 | + * to create the committer, which on Hadoop 3.1 and later involves the |
| 108 | + * {@code PathOutputCommitterFactory} mechanism to dynamically choose a committer |
| 109 | + * for the target filesystem. Such committers do not generated summary files. |
102 | 110 | * @param <T> the type of the materialized records
|
103 | 111 | */
|
104 | 112 | public class ParquetOutputFormat<T> extends FileOutputFormat<Void, T> {
|
@@ -158,6 +166,13 @@ public static enum JobSummaryLevel {
|
158 | 166 | public static final String PAGE_ROW_COUNT_LIMIT = "parquet.page.row.count.limit";
|
159 | 167 | public static final String PAGE_WRITE_CHECKSUM_ENABLED = "parquet.page.write-checksum.enabled";
|
160 | 168 |
|
| 169 | + /** |
| 170 | + * Use the output committer created by the superclass, rather than a {@link ParquetOutputCommitter}. |
| 171 | + * This delivers correctness and scalability on cloud storage, but will not write schema files. |
| 172 | + * Value: {@value}. |
| 173 | + */ |
| 174 | + public static final String PAGE_PATH_OUTPUT_COMMITTER_ENABLED = "parquet.path.outputcommitter.enabled"; |
| 175 | + |
161 | 176 | public static JobSummaryLevel getJobSummaryLevel(Configuration conf) {
|
162 | 177 | String level = conf.get(JOB_SUMMARY_LEVEL);
|
163 | 178 | String deprecatedFlag = conf.get(ENABLE_JOB_SUMMARY);
|
@@ -390,7 +405,7 @@ public static boolean getPageWriteChecksumEnabled(Configuration conf) {
|
390 | 405 | }
|
391 | 406 |
|
392 | 407 | private WriteSupport<T> writeSupport;
|
393 |
| - private ParquetOutputCommitter committer; |
| 408 | + private OutputCommitter committer; |
394 | 409 |
|
395 | 410 | /**
|
396 | 411 | * constructor used when this OutputFormat in wrapped in another one (In Pig for example)
|
@@ -555,7 +570,22 @@ public WriteSupport<T> getWriteSupport(Configuration configuration) {
|
555 | 570 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
|
556 | 571 | if (committer == null) {
|
557 | 572 | Path output = getOutputPath(context);
|
558 |
| - committer = new ParquetOutputCommitter(output, context); |
| 573 | + final Configuration conf = context.getConfiguration(); |
| 574 | + if (conf.getBoolean(PAGE_PATH_OUTPUT_COMMITTER_ENABLED, DEFAULT_PAGE_PATH_OUTPUT_COMMITTER_ENABLED)) { |
| 575 | + // hand off creation of a committer to superclass. |
| 576 | + // On hadoop 3.1+ this will use a factory mechanism to dynamically |
| 577 | + // bind to a filesystem specific committer, an explict override |
| 578 | + // or fall back to the classic FileOutputCommitter |
| 579 | + committer = super.getOutputCommitter(context); |
| 580 | + LOG.debug("Writing to {} with output committer {}", committer, output); |
| 581 | + |
| 582 | + if (ParquetOutputFormat.getJobSummaryLevel(conf) != JobSummaryLevel.NONE) { |
| 583 | + // warn if summary file generation has been requested, as they won't be created. |
| 584 | + LOG.warn("Committer {} does not support summary files", committer); |
| 585 | + } |
| 586 | + } else { |
| 587 | + committer = new ParquetOutputCommitter(output, context); |
| 588 | + } |
559 | 589 | }
|
560 | 590 | return committer;
|
561 | 591 | }
|
|
0 commit comments