diff --git a/core-plugins/pom.xml b/core-plugins/pom.xml index fcbf95697..70644a501 100644 --- a/core-plugins/pom.xml +++ b/core-plugins/pom.xml @@ -279,12 +279,6 @@ org.mockito mockito-core - - io.cdap.plugin - format-xls - ${project.version} - test - io.cdap.plugin format-xls diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java index 155f75bec..0b79787f6 100644 --- a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java +++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java @@ -16,7 +16,6 @@ package io.cdap.plugin.format.xls.input; -import com.google.common.base.Preconditions; import io.cdap.cdap.api.data.format.StructuredRecord; import io.cdap.cdap.api.data.schema.Schema; import io.cdap.plugin.format.input.PathTrackingInputFormat; @@ -48,7 +47,7 @@ */ public class XlsInputFormat extends PathTrackingInputFormat { - public static final String SHEET_NO = "Sheet Number"; + public static final String SHEET_NUM = "Sheet Number"; public static final String SHEET_VALUE = "sheetValue"; public static final String NAME_SKIP_HEADER = "skipHeader"; public static final String TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow"; @@ -58,10 +57,10 @@ protected RecordReader createRecordReade FileSplit split, TaskAttemptContext context, @Nullable String pathField, @Nullable Schema schema) throws IOException { Configuration jobConf = context.getConfiguration(); - boolean skipFirstRow = jobConf.getBoolean(NAME_SKIP_HEADER, true); + boolean skipFirstRow = jobConf.getBoolean(NAME_SKIP_HEADER, false); boolean terminateIfEmptyRow = jobConf.getBoolean(TERMINATE_IF_EMPTY_ROW, false); - Schema outputSchema = schema != null ? Schema.parseJson(context.getConfiguration().get("schema")) : null; - String sheet = jobConf.get(SHEET_NO); + Schema outputSchema = schema != null ? Schema.parseJson(context.getConfiguration().get("schema")) : null; + String sheet = jobConf.get(SHEET_NUM); String sheetValue = jobConf.get(SHEET_VALUE, "0"); return new XlsRecordReader(sheet, sheetValue, outputSchema, terminateIfEmptyRow, skipFirstRow); } diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java index 4b98cf74f..bedcb48a3 100644 --- a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java +++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java @@ -43,7 +43,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import java.util.Objects; import javax.annotation.Nullable; @@ -84,12 +83,7 @@ public void validate(FormatContext context) { if (!conf.containsMacro(XlsInputFormatConfig.NAME_SHEET_VALUE) && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER) && !Strings.isNullOrEmpty(conf.getSheetValue())) { - try { - Integer.parseInt(conf.getSheetValue()); - } catch (NumberFormatException e) { - collector.addFailure("Sheet number must be a number.", null) - .withConfigProperty(XlsInputFormatConfig.NAME_SHEET_VALUE); - } + getSheetAsNumber(collector); } if (!conf.containsMacro(PathTrackingConfig.NAME_SCHEMA) && schema == null && context.getInputSchema() == null) { collector.addFailure("XLS format cannot be used without specifying a schema.", "Schema must be specified.") @@ -99,7 +93,7 @@ public void validate(FormatContext context) { @Override protected void addFormatProperties(Map properties) { - properties.put(XlsInputFormat.SHEET_NO, conf.getSheet()); + properties.put(XlsInputFormat.SHEET_NUM, conf.getSheet()); if (!Strings.isNullOrEmpty(conf.getSheetValue())) { properties.put(XlsInputFormat.SHEET_VALUE, conf.getSheetValue()); } @@ -122,15 +116,9 @@ public Schema detectSchema(FormatContext context, InputFiles inputFiles) throws Sheet workSheet; // Check if user wants to access with name or number if (conf.getSheet() != null && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER)) { - int sheetValue = 0; - if (!Strings.isNullOrEmpty(conf.getSheetValue())) { - try { - sheetValue = Integer.parseInt(conf.getSheetValue()); - } catch (NumberFormatException e) { - failureCollector.addFailure("Sheet number must be a number.", null) - .withConfigProperty(XlsInputFormatConfig.NAME_SHEET_VALUE); - return null; - } + Integer sheetValue = getSheetAsNumber(failureCollector); + if (sheetValue == null) { + return null; } workSheet = workbook.getSheetAt(sheetValue); } else { @@ -204,4 +192,20 @@ public Schema detectSchema(FormatContext context, InputFiles inputFiles) throws return null; } + private Integer getSheetAsNumber(FailureCollector failureCollector) { + if (!Strings.isNullOrEmpty(conf.getSheetValue())) { + try { + int sheetValue = Integer.parseInt(conf.getSheetValue()); + if (sheetValue >= 0) { + return sheetValue; + } + failureCollector.addFailure("Sheet number must be a positive number.", null) + .withConfigProperty(XlsInputFormatConfig.NAME_SHEET_VALUE); + } catch (NumberFormatException e) { + failureCollector.addFailure("Sheet number must be a number.", null) + .withConfigProperty(XlsInputFormatConfig.NAME_SHEET_VALUE); + } + } + return null; + } } diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsRowConverter.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsRowConverter.java index 2d0ee046f..c7e8a8d8b 100644 --- a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsRowConverter.java +++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsRowConverter.java @@ -26,6 +26,7 @@ import org.apache.poi.ss.usermodel.Row; import java.util.List; +import javax.annotation.Nullable; /** * Converts a row of XLS cells to a StructuredRecord. @@ -42,6 +43,7 @@ public class XlsRowConverter { * Converts a row of XLS cells to a StructuredRecord. * Returns null if the row is null or empty. */ + @Nullable public StructuredRecord.Builder convert(Row row, Schema outputSchema) { if (row == null) { return null; @@ -49,12 +51,7 @@ public StructuredRecord.Builder convert(Row row, Schema outputSchema) { boolean isRowEmpty = true; StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema); List fields = outputSchema.getFields(); - for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) { - if (cellIndex >= fields.size()) { - throw new IllegalArgumentException( - String.format("Schema contains fewer fields than the number of columns in the excel file. " + - "Schema fields: %s, Excel columns: %s", fields.size(), row.getLastCellNum())); - } + for (int cellIndex = 0; cellIndex < row.getLastCellNum() && cellIndex < fields.size(); cellIndex++) { Cell cell = row.getCell(cellIndex, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL); if (cell == null) { // Blank cells are skipped, builder will set null for the field, no processing needed. @@ -75,6 +72,7 @@ public StructuredRecord.Builder convert(Row row, Schema outputSchema) { cellValue = getCellAsBoolean(cell); break; default: + // As we only support string, double and boolean, this should never happen. throw new IllegalArgumentException( String.format("Field '%s' is of unsupported type '%s'. Supported types are: %s", field.getName(), type, "string, double, boolean"));