From e6ed78fb27d9dffaec9eebe9ed113780a2172b65 Mon Sep 17 00:00:00 2001
From: vikasrathee-cs <vikas.rathee@cloudsufi.com>
Date: Mon, 6 Nov 2023 13:49:38 +0530
Subject: [PATCH] Added format xls

---
 .../io/cdap/plugin/format/FileFormat.java     |   3 +-
 format-xls/pom.xml                            |  89 ++++++++
 .../format/xls/input/XlsInputFormat.java      | 191 ++++++++++++++++++
 .../xls/input/XlsInputFormatConfig.java       | 157 ++++++++++++++
 .../input/XlsInputFormatDataFormatter.java    |  84 ++++++++
 .../xls/input/XlsInputFormatProvider.java     | 190 +++++++++++++++++
 .../input/XlsInputFormatSchemaDetector.java   | 146 +++++++++++++
 .../format/xls/input/XlsInputFormatUtils.java |  89 ++++++++
 pom.xml                                       |   1 +
 9 files changed, 949 insertions(+), 1 deletion(-)
 create mode 100644 format-xls/pom.xml
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatConfig.java
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatDataFormatter.java
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatSchemaDetector.java
 create mode 100644 format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatUtils.java
diff --git a/format-common/src/main/java/io/cdap/plugin/format/FileFormat.java b/format-common/src/main/java/io/cdap/plugin/format/FileFormat.java
index b291d34e8..620c81b18 100644
--- a/format-common/src/main/java/io/cdap/plugin/format/FileFormat.java
+++ b/format-common/src/main/java/io/cdap/plugin/format/FileFormat.java
@@ -38,7 +38,8 @@ public enum FileFormat {
   ORC(false, true),
   PARQUET(true, true),
   TEXT(true, false),
-  TSV(true, true);
+  TSV(true, true),
+  XLS(true,false);
   private final boolean canRead;
   private final boolean canWrite;
 
diff --git a/format-xls/pom.xml b/format-xls/pom.xml
new file mode 100644
index 000000000..350dee214
--- /dev/null
+++ b/format-xls/pom.xml
@@ -0,0 +1,89 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <!--
+   ~ Copyright © 2023 Cask Data, Inc.
+   ~
+   ~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
+   ~ use this file except in compliance with the License. You may obtain a copy of
+   ~ the License at
+   ~
+   ~ http://www.apache.org/licenses/LICENSE-2.0
+   ~
+   ~ Unless required by applicable law or agreed to in writing, software
+   ~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+   ~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+   ~ License for the specific language governing permissions and limitations under
+   ~ the License.
+   -->
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>io.cdap.plugin</groupId>
+    <artifactId>hydrator-plugins</artifactId>
+    <version>2.12.0-SNAPSHOT</version>
+  </parent>
+  <artifactId>format-xls</artifactId>
+  <name>XLSX format plugins</name>
+  <packaging>jar</packaging>
+
+
+  <dependencies>
+    <dependency>
+      <groupId>io.cdap.cdap</groupId>
+      <artifactId>cdap-etl-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>io.cdap.cdap</groupId>
+      <artifactId>cdap-formats</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>io.cdap.cdap</groupId>
+      <artifactId>hydrator-test</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>io.cdap.plugin</groupId>
+      <artifactId>format-common</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>5.2.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-ooxml</artifactId>
+      <version>5.2.4</version>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+    </dependency>
+  </dependencies>
+
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <configuration>
+          <instructions>
+            <_exportcontents>
+              io.cdap.plugin.format.xls.*
+            </_exportcontents>
+            <Embed-Dependency>*;inline=false;scope=compile</Embed-Dependency>
+            <Embed-Transitive>true</Embed-Transitive>
+            <Embed-Directory>lib</Embed-Directory>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>io.cdap</groupId>
+        <artifactId>cdap-maven-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java
new file mode 100644
index 000000000..566631605
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormat.java
@@ -0,0 +1,191 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import com.google.common.base.Preconditions;
+import com.google.common.base.Strings;
+import io.cdap.cdap.api.data.format.StructuredRecord;
+import io.cdap.cdap.api.data.schema.Schema;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellType;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.FormulaEvaluator;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+
+import java.io.IOException;
+import java.util.List;
+
+
+/**
+ * {@link XlsInputFormat} is {@link TextInputFormat} implementation for reading Excel files.
+ * <p>
+ * The {@link XlsInputFormat.XlsRecordReader} reads a given sheet, and within a sheet reads
+ * all columns and all rows.
+ */
+public class XlsInputFormat extends CombineFileInputFormat<LongWritable, StructuredRecord> {
+
+  public static final String SHEET_NO = "Sheet Number";
+  public static final String SHEET_VALUE = "sheetValue";
+  public static final String NAME_SKIP_HEADER = "skipHeader";
+  public static final String TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow";
+
+  @Override
+  public RecordReader<LongWritable, StructuredRecord> createRecordReader(InputSplit split, TaskAttemptContext context) {
+    return new XlsRecordReader();
+  }
+
+  /**
+   * Reads excel spread sheet, where the keys are the offset in the excel file and the text is the complete record.
+   */
+  public static class XlsRecordReader extends RecordReader<LongWritable, StructuredRecord> {
+    // DataFormatter to format and get each cell's value as String
+    XlsInputFormatDataFormatter formatter;
+    FormulaEvaluator formulaEvaluator;
+    // Map key that represents the row index.
+    private LongWritable key;
+    // Map value that represents an excel row
+    private StructuredRecord value;
+    private Sheet workSheet;
+    // InputStream handler for Excel files.
+    private FSDataInputStream fileIn;
+    // Specifies the row index.
+    private int rowIndex;
+    // Specifies last row num.
+    private int lastRowNum;
+    private Schema outputSchema;
+    private boolean terminateIfEmptyRow;
+    private boolean isRowNull;
+
+    @Override
+    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+
+      CombineFileSplit split = (CombineFileSplit) genericSplit;
+      Configuration job = context.getConfiguration();
+      // Path of input file.
+      Path file = split.getPath(0);
+      String schema = context.getConfiguration().get("schema");
+      outputSchema = schema != null ? Schema.parseJson(schema) : null;
+      FileSystem fs = file.getFileSystem(job);
+      fileIn = fs.open(split.getPath(0));
+
+      String sheet = job.get(SHEET_NO);
+      String sheetValue = job.get(SHEET_VALUE, "0");
+      terminateIfEmptyRow = job.getBoolean(TERMINATE_IF_EMPTY_ROW, false);
+
+      try (Workbook workbook = WorkbookFactory.create(fileIn)) {
+        formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator();
+        formulaEvaluator.setIgnoreMissingWorkbooks(true);
+        formatter = new XlsInputFormatDataFormatter(formulaEvaluator);
+        // Check if user wants to access with name or number
+        if (sheet.equals(XlsInputFormatConfig.SHEET_NUMBER)) {
+          workSheet = workbook.getSheetAt(Integer.parseInt(sheetValue));
+        } else {
+          workSheet = workbook.getSheet(sheetValue);
+        }
+      } catch (Exception e) {
+        throw new IllegalArgumentException("Exception while reading excel sheet. " + e.getMessage(), e);
+      }
+
+      lastRowNum = workSheet.getLastRowNum();
+      rowIndex = 0;
+      isRowNull = false;
+
+      boolean skipFirstRow = job.getBoolean(NAME_SKIP_HEADER, true);
+      if (skipFirstRow) {
+        Preconditions.checkArgument(lastRowNum != -1, "No rows found on sheet %s", sheetValue);
+        rowIndex = 1;
+      }
+    }
+
+    @Override
+    public boolean nextKeyValue() {
+      // If any is true, then we stop processing.
+      if (rowIndex > lastRowNum || lastRowNum == -1 || (isRowNull && terminateIfEmptyRow)) {
+        return false;
+      }
+      // Get the next row.
+      Row row = workSheet.getRow(rowIndex);
+      key = new LongWritable(rowIndex);
+
+      StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
+      List<Schema.Field> fields = outputSchema.getFields();
+
+      isRowNull = true;
+      for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
+        if (cellIndex >= fields.size()) {
+         throw new IllegalArgumentException(
+           String.format("Schema contains less fields than the number of columns in the excel file. " +
+                           "Schema fields: %s, Excel columns: %s", fields.size(), row.getLastCellNum()));
+        }
+        Cell cell = row.getCell(cellIndex, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL);
+        if (cell == null) {
+          continue;
+        }
+        Schema.Field field = fields.get(cellIndex);
+        Schema.Type type = field.getSchema().isNullable() ?
+                field.getSchema().getNonNullable().getType() : field.getSchema().getType();
+        String result = formatter.formatCellValue(cell, type);
+        if (result == null) {
+          continue;
+        }
+        isRowNull = false;
+        builder.convertAndSet(field.getName(), result);
+      }
+      value = builder.build();
+      rowIndex++;
+
+      // Stop processing if the row is null and terminateIfEmptyRow is true.
+      return !isRowNull || !terminateIfEmptyRow;
+    }
+
+    @Override
+    public float getProgress() {
+      return (float) rowIndex / lastRowNum;
+    }
+
+    @Override
+    public void close() throws IOException {
+      if (fileIn != null) {
+        fileIn.close();
+      }
+    }
+
+    @Override
+    public LongWritable getCurrentKey() {
+      return key;
+    }
+
+    @Override
+    public StructuredRecord getCurrentValue() {
+      return value;
+    }
+  }
+}
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatConfig.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatConfig.java
new file mode 100644
index 000000000..25b1007e3
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatConfig.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import com.google.common.base.Strings;
+import io.cdap.cdap.api.annotation.Description;
+import io.cdap.cdap.api.annotation.Macro;
+import io.cdap.cdap.api.annotation.Name;
+import io.cdap.cdap.api.data.schema.Schema;
+import io.cdap.cdap.api.dataset.lib.KeyValue;
+import io.cdap.cdap.api.plugin.PluginPropertyField;
+import io.cdap.plugin.common.KeyValueListParser;
+import io.cdap.plugin.format.input.PathTrackingConfig;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import javax.annotation.Nullable;
+
+/**
+ * Common config for Excel related formats.
+ */
+public class XlsInputFormatConfig extends PathTrackingConfig {
+  public static final String SHEET_NUMBER = "Sheet Number";
+  private static final String NAME_OVERRIDE = "override";
+  private static final String NAME_SHEET = "sheet";
+  public static final String NAME_SHEET_VALUE = "sheetValue";
+  private static final String NAME_SKIP_HEADER = "skipHeader";
+  private static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow";
+
+  // properties
+  public static final String NAME_SAMPLE_SIZE = "sampleSize";
+
+  public static final String DESC_SKIP_HEADER =
+    "Whether to skip the first line of each file. The default value is false.";
+  public static final String DESC_SHEET = "Select the sheet by name or number. Default is 'Sheet Number'.";
+  public static final String DESC_SHEET_VALUE = "Specifies the value corresponding to 'sheet' input. " +
+    "Can be either sheet name or sheet no; for example: 'Sheet1' or '0' in case user selects 'Sheet Name' or " +
+    "'Sheet Number' as 'sheet' input respectively. Sheet number starts with 0.";
+  public static final String DESC_TERMINATE_ROW = "Specify whether processing needs to be terminated in case an" +
+    " empty row is encountered while processing excel files. Options to select are true or false.";
+  public static final Map<String, PluginPropertyField> XLS_FIELDS;
+
+  static {
+    Map<String, PluginPropertyField> fields = new HashMap<>(FIELDS);
+    fields.put(NAME_SKIP_HEADER,
+            new PluginPropertyField(NAME_SKIP_HEADER, DESC_SKIP_HEADER, "boolean", false, true));
+    // Add fields specific for excel format handling.
+    fields.put(NAME_SHEET, new PluginPropertyField(NAME_SHEET, DESC_SHEET, "string", false, true));
+    fields.put(NAME_SHEET_VALUE, new PluginPropertyField(NAME_SHEET_VALUE, DESC_SHEET_VALUE, "string", false, true));
+    fields.put(NAME_TERMINATE_IF_EMPTY_ROW, new PluginPropertyField(
+                       NAME_TERMINATE_IF_EMPTY_ROW, DESC_TERMINATE_ROW, "boolean", false, true));
+    XLS_FIELDS = Collections.unmodifiableMap(fields);
+  }
+
+  @Macro
+  @Nullable
+  @Name(NAME_SHEET)
+  @Description(DESC_SHEET)
+  private String sheet;
+
+  @Macro
+  @Nullable
+  @Name(NAME_SHEET_VALUE)
+  @Description(DESC_SHEET_VALUE)
+  private String sheetValue;
+
+
+  @Macro
+  @Nullable
+  @Name(NAME_SKIP_HEADER)
+  @Description(DESC_SKIP_HEADER)
+  private Boolean skipHeader;
+
+  @Macro
+  @Nullable
+  @Name(NAME_TERMINATE_IF_EMPTY_ROW)
+  @Description(DESC_TERMINATE_ROW)
+  private Boolean terminateIfEmptyRow;
+
+  public XlsInputFormatConfig() {
+    super();
+  }
+
+  public long getSampleSize() {
+    return Long.parseLong(getProperties().getProperties().getOrDefault(NAME_SAMPLE_SIZE, "1000"));
+  }
+
+  public String getSheet() {
+    return sheet == null ? SHEET_NUMBER : sheet;
+  }
+
+  @Nullable
+  public String getSheetValue() {
+    return sheetValue;
+  }
+
+  public boolean getSkipHeader() {
+    return skipHeader != null ? skipHeader : false;
+  }
+
+  public boolean getTerminateIfEmptyRow() {
+    return terminateIfEmptyRow != null ? terminateIfEmptyRow : false;
+  }
+
+  /**
+   * Parses a list of key-value items of column names and their corresponding data types, manually set by the user.
+   *
+   * @return A hashmap of column names and their manually set schemas.
+   */
+  public Map<String, Schema> getOverride() throws IllegalArgumentException {
+    String override = getProperties().getProperties().get(NAME_OVERRIDE);
+    Map<String, Schema> overrideDataTypes = new HashMap<>();
+    KeyValueListParser kvParser = new KeyValueListParser("\\s*,\\s*", ":");
+    if (!Strings.isNullOrEmpty(override)) {
+      for (KeyValue<String, String> keyVal : kvParser.parse(override)) {
+        String name = keyVal.getKey();
+        String stringDataType = keyVal.getValue();
+
+        Schema schema;
+        switch (stringDataType) {
+          case "date":
+            schema = Schema.of(Schema.LogicalType.DATE);
+            break;
+          case "time":
+            schema = Schema.of(Schema.LogicalType.TIME_MICROS);
+            break;
+          case "timestamp":
+            schema = Schema.of(Schema.LogicalType.TIMESTAMP_MICROS);
+            break;
+          default:
+            schema = Schema.of(Schema.Type.valueOf(stringDataType.toUpperCase()));
+        }
+
+        if (overrideDataTypes.containsKey(name)) {
+          throw new IllegalArgumentException(String.format("Cannot convert '%s' to multiple types.", name));
+        }
+        overrideDataTypes.put(name, schema);
+      }
+    }
+    return overrideDataTypes;
+  }
+}
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatDataFormatter.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatDataFormatter.java
new file mode 100644
index 000000000..756430239
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatDataFormatter.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import io.cdap.cdap.api.data.schema.Schema;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellType;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.DateUtil;
+import org.apache.poi.ss.usermodel.FormulaEvaluator;
+
+/**
+ * Formats the cell value of an Excel file.
+ */
+public class XlsInputFormatDataFormatter {
+  private static final DataFormatter dataFormatter = new DataFormatter();
+  private final FormulaEvaluator evaluator;
+
+  /**
+   * Constructor for XlsInputFormatDataFormatter.
+   *
+   * @param evaluator the formula evaluator
+   */
+  public XlsInputFormatDataFormatter(FormulaEvaluator evaluator) {
+    this.evaluator = evaluator;
+  }
+
+  /**
+   * Formats the cell value of an Excel file.
+   *
+   * @param cell the cell to format
+   * @return the formatted cell value
+   */
+  public String formatCellValue(Cell cell, Schema.Type type) {
+    if (cell == null) {
+      return null;
+    }
+
+    CellType cellType = cell.getCellType();
+    if (cellType == CellType.FORMULA) {
+      try {
+        cellType = cell.getCachedFormulaResultType();
+      } catch (Exception e) {
+        cellType = evaluator.evaluateFormulaCell(cell);
+      }
+    }
+
+    switch (cellType) {
+      case NUMERIC:
+        if (DateUtil.isCellDateFormatted(cell)) {
+          return dataFormatter.formatCellValue(cell);
+        }
+        return Double.toString(cell.getNumericCellValue());
+      case STRING:
+        if (type == Schema.Type.DOUBLE) {
+          // Edge case when schema was inferred as double but the cell is actually a string
+          // this can be caused by an error formula cell, as the error value is stored as a string
+          return null;
+        }
+        return cell.getRichStringCellValue().getString();
+      case BOOLEAN:
+        return cell.getBooleanCellValue() ? "TRUE" : "FALSE";
+      case BLANK:
+      case ERROR:
+        return null;
+      default:
+        throw new IllegalStateException("Unexpected celltype (" + cellType + ")");
+    }
+  }
+}
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java
new file mode 100644
index 000000000..13eec1fc0
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatProvider.java
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import io.cdap.cdap.api.annotation.Description;
+import io.cdap.cdap.api.annotation.Name;
+import io.cdap.cdap.api.annotation.Plugin;
+import io.cdap.cdap.api.data.schema.Schema;
+import io.cdap.cdap.api.plugin.PluginClass;
+import io.cdap.cdap.etl.api.FailureCollector;
+import io.cdap.cdap.etl.api.validation.FormatContext;
+import io.cdap.cdap.etl.api.validation.InputFile;
+import io.cdap.cdap.etl.api.validation.InputFiles;
+import io.cdap.cdap.etl.api.validation.ValidatingInputFormat;
+import io.cdap.plugin.format.input.PathTrackingConfig;
+import io.cdap.plugin.format.input.PathTrackingInputFormatProvider;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.FormulaEvaluator;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import org.apache.poi.ss.util.CellReference;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import javax.annotation.Nullable;
+
+
+/**
+ * Reads XLS(X) into StructuredRecords.
+ */
+@Plugin(type = ValidatingInputFormat.PLUGIN_TYPE)
+@Name(XlsInputFormatProvider.NAME)
+@Description(XlsInputFormatProvider.DESC)
+public class XlsInputFormatProvider extends PathTrackingInputFormatProvider<XlsInputFormatConfig> {
+  static final String NAME = "xls";
+  static final String DESC = "Plugin for reading files in xls(x) format.";
+  public static final PluginClass PLUGIN_CLASS = PluginClass.builder()
+          .setType(ValidatingInputFormat.PLUGIN_TYPE)
+          .setName(NAME)
+          .setDescription(DESC)
+          .setClassName(XlsInputFormatProvider.class.getName())
+          .setConfigFieldName("conf")
+          .setProperties(XlsInputFormatConfig.XLS_FIELDS)
+          .build();
+  private final XlsInputFormatConfig conf;
+
+  public XlsInputFormatProvider(XlsInputFormatConfig conf) {
+    super(conf);
+    this.conf = conf;
+  }
+
+  @Override
+  public String getInputFormatClassName() {
+    return XlsInputFormat.class.getName();
+  }
+
+  @Override
+  public void validate(FormatContext context) {
+    Schema schema = super.getSchema(context);
+    // When the sheet is specified by number, the sheet value must be a number
+    if (!conf.containsMacro(XlsInputFormatConfig.NAME_SHEET_VALUE)
+            && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER)
+                    && (conf.getSheetValue() == null || !conf.getSheetValue().matches("[0-9]+"))) {
+      context.getFailureCollector().addFailure("Sheet number must be a number.", null)
+              .withConfigProperty(XlsInputFormatConfig.NAME_SHEET_VALUE);
+    }
+    FailureCollector collector = context.getFailureCollector();
+    if (!conf.containsMacro(PathTrackingConfig.NAME_SCHEMA) && schema == null && context.getInputSchema() == null) {
+      collector.addFailure("XLS format cannot be used without specifying a schema.", "Schema must be specified.")
+              .withConfigProperty(PathTrackingConfig.NAME_SCHEMA);
+    }
+  }
+
+  @Override
+  protected void addFormatProperties(Map<String, String> properties) {
+    properties.put(XlsInputFormat.SHEET_NO, conf.getSheet());
+    properties.put(XlsInputFormat.SHEET_VALUE, conf.getSheetValue());
+    properties.put(XlsInputFormat.NAME_SKIP_HEADER, String.valueOf(conf.getSkipHeader()));
+    properties.put(XlsInputFormat.TERMINATE_IF_EMPTY_ROW, String.valueOf(conf.getTerminateIfEmptyRow()));
+  }
+
+  @Override
+  @Nullable
+  public Schema detectSchema(FormatContext context, InputFiles inputFiles) throws IOException {
+    String blankHeader = "BLANK";
+    FailureCollector failureCollector = context.getFailureCollector();
+    FormulaEvaluator formulaEvaluator;
+    for (InputFile inputFile : inputFiles) {
+      DataFormatter formatter = new DataFormatter();
+      try (Workbook workbook = WorkbookFactory.create(inputFile.open())) {
+        formulaEvaluator = workbook.getCreationHelper().createFormulaEvaluator();
+        formulaEvaluator.setIgnoreMissingWorkbooks(true);
+        Sheet workSheet;
+        // Check if user wants to access with name or number
+        if (conf.getSheet() != null && conf.getSheet().equals(XlsInputFormatConfig.SHEET_NUMBER)) {
+          workSheet = workbook.getSheetAt(Integer.parseInt(Objects.requireNonNull(conf.getSheetValue())));
+        } else {
+          workSheet = workbook.getSheet(conf.getSheetValue());
+        }
+
+        // If provided sheet does not exist, throw an exception
+        if (workSheet == null) {
+          failureCollector.addFailure("Sheet " + conf.getSheetValue() + " does not exist in the workbook.",
+                  "Specify a valid sheet.");
+          return null;
+        }
+
+        int sampleSizeInt = getSampleSizeInt();
+        // Row numbers are 0 based in POI
+        int rowStart = Math.min(0, workSheet.getFirstRowNum());
+        int rowEnd = Math.min(sampleSizeInt, workSheet.getLastRowNum());
+
+        int lastCellNumMax = 0;
+        List<String> columnNames = new ArrayList<>();
+        XlsInputFormatSchemaDetector schemaDetector = new XlsInputFormatSchemaDetector();
+        for (int rowIndex = rowStart; rowIndex <= rowEnd; rowIndex++) {
+          Row row = workSheet.getRow(rowIndex);
+          if (row == null) {
+            continue;
+          }
+          lastCellNumMax = Math.max(lastCellNumMax, row.getLastCellNum());
+
+          // Use the first row to get the column names
+          if (rowIndex == 0 && conf.getSkipHeader()) {
+            for (int cellIndex = 0; cellIndex < lastCellNumMax; cellIndex++) {
+              Cell cell = row.getCell(cellIndex, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL);
+              columnNames.add(cell == null ? blankHeader : formatter.formatCellValue(cell, formulaEvaluator));
+            }
+            // Skip Header
+            continue;
+          }
+
+          for (int cellIndex = 0; cellIndex < lastCellNumMax; cellIndex++) {
+            Cell cell = row.getCell(cellIndex, Row.MissingCellPolicy.RETURN_BLANK_AS_NULL);
+            boolean isFirstRow = rowIndex == (conf.getSkipHeader() ? 1 : 0);
+            schemaDetector.reduceSchema(cellIndex, cell, isFirstRow);
+          }
+
+        }
+
+        // If some rows have more cells than the first row, add blank headers for the extra cells
+        if (lastCellNumMax > columnNames.size() && conf.getSkipHeader()) {
+          for (int i = columnNames.size(); i < lastCellNumMax; i++) {
+            columnNames.add(blankHeader);
+          }
+        }
+
+        // Set column names if header is not skipped
+        if (!conf.getSkipHeader()) {
+          for (int i = 0; i < lastCellNumMax; i++) {
+            columnNames.add(CellReference.convertNumToColString(i));
+          }
+        }
+
+        Schema schema = Schema.recordOf("xls", schemaDetector.getFields(
+                XlsInputFormatUtils.getSafeColumnNames(columnNames), conf.getOverride()));
+        return PathTrackingInputFormatProvider.addPathField(context.getFailureCollector(), schema, conf.getPathField());
+      }
+    }
+    return null;
+  }
+
+  private int getSampleSizeInt() {
+    // Note: Sample size is long, but we are casting it to int here. This is because the POI API uses int
+    // for row numbers, so we cannot support sample sizes larger than Integer.MAX_VALUE
+    long sampleSize = conf.getSampleSize() > Integer.MAX_VALUE ? Integer.MAX_VALUE : conf.getSampleSize();
+    return (int) sampleSize;
+  }
+
+}
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatSchemaDetector.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatSchemaDetector.java
new file mode 100644
index 000000000..c2f242961
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatSchemaDetector.java
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import io.cdap.cdap.api.data.schema.Schema;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellType;
+import org.apache.poi.ss.usermodel.DateUtil;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Detects the schema of an Excel file.
+ */
+public class XlsInputFormatSchemaDetector {
+
+  private final Map<Integer, Schema.Type> columnSchemaReducerMap = new HashMap<>();
+  private final Map<Integer, Boolean> columnNullableMap = new HashMap<>();
+
+  /**
+   * Reduces the schema of the Excel file.
+   *
+   * @param columnIndex the column index of the cell
+   * @param cell the cell to reduce the schema from
+   * @param isFirstRow whether the cell is in the first row
+   */
+  public void reduceSchema(int columnIndex, Cell cell, boolean isFirstRow) {
+    boolean isCellEmpty = isCellEmpty(cell);
+
+    if (!columnNullableMap.containsKey(columnIndex)) {
+      // When we see the index for the first time and this is not the first row,
+      // we can assume that the column is nullable as the previous rows did not have a value for this column.
+      columnNullableMap.put(columnIndex, !isFirstRow);
+    }
+    // Pin the nullability of the column to true if the cell is empty
+    columnNullableMap.put(columnIndex, isCellEmpty || columnNullableMap.get(columnIndex));
+    if (isCellEmpty) {
+      return;
+    }
+    // Check if key exists in map
+    if (columnSchemaReducerMap.containsKey(columnIndex)) {
+      // If key exists, reduce the schema type
+      columnSchemaReducerMap.put(columnIndex, reduceSchemaType(columnSchemaReducerMap.get(columnIndex), cell));
+    } else {
+      // If key does not exist, add it to the map
+      columnSchemaReducerMap.put(columnIndex, getSchemaType(cell));
+    }
+  }
+
+  private void normalizeColumn(int numColumns) {
+    for (int i = 0; i < numColumns; i++) {
+      // set all nullability to true if not present
+      columnNullableMap.putIfAbsent(i, true);
+      // set all schema types to string if not present
+      columnSchemaReducerMap.putIfAbsent(i, Schema.Type.STRING);
+    }
+  }
+
+  /**
+   * Returns the schema of the Excel file.
+   *
+   * @param columnNames the column names of the Excel file
+   * @param override the override schema of the Excel file provided by the user
+   * @return the schema of the Excel file
+   */
+  public List<Schema.Field> getFields(List<String> columnNames, Map<String, Schema> override) {
+    normalizeColumn(columnNames.size());
+    List<Schema.Field> fields = new ArrayList<>();
+    for (int i = 0; i < columnNames.size(); i++) {
+      String columnName = columnNames.get(i);
+      boolean isNullable = columnNullableMap.get(i);
+      if (override.containsKey(columnName)) {
+        Schema schema = isNullable ? Schema.nullableOf(override.get(columnName)) : override.get(columnName);
+        fields.add(Schema.Field.of(columnName, schema));
+        continue;
+      }
+      Schema.Type schemaType = columnSchemaReducerMap.get(i);
+      Schema schema = isNullable ? Schema.nullableOf(Schema.of(schemaType)) : Schema.of(schemaType);
+      fields.add(Schema.Field.of(columnName, schema));
+    }
+    return fields;
+  }
+
+  private static boolean isCellEmpty(Cell cell) {
+    if (cell != null && cell.getCellType() == CellType.FORMULA) {
+      return cell.getCachedFormulaResultType() == CellType.BLANK;
+    }
+    return cell == null || cell.getCellType() == CellType.BLANK;
+  }
+
+  private static Schema.Type getSchemaType(Cell cell) {
+    CellType cellType = cell.getCellType() == CellType.FORMULA ?
+                    cell.getCachedFormulaResultType() : cell.getCellType();
+    // Force Dates As String
+    if (cellType == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
+      return Schema.Type.STRING;
+    }
+    // Mapping for XLS Cell Types to CDAP Schema Types
+    switch (cellType) {
+      case BOOLEAN:
+        return Schema.Type.BOOLEAN;
+      case NUMERIC:
+        return Schema.Type.DOUBLE;
+      default:
+        return Schema.Type.STRING;
+    }
+  }
+  private static Schema.Type reduceSchemaType(Schema.Type detectedSchemaType, Cell cell) {
+    if (detectedSchemaType == Schema.Type.STRING) {
+      return Schema.Type.STRING;
+    }
+    CellType cellType = cell.getCellType() == CellType.FORMULA ?
+            cell.getCachedFormulaResultType() : cell.getCellType();
+    switch (cellType) {
+      case BOOLEAN:
+        switch (detectedSchemaType) {
+          case BOOLEAN:
+            return Schema.Type.BOOLEAN;
+          case DOUBLE:
+            return Schema.Type.DOUBLE;
+        }
+        return Schema.Type.STRING;
+      case NUMERIC:
+        return Schema.Type.DOUBLE;
+    }
+    return Schema.Type.STRING;
+  }
+}
diff --git a/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatUtils.java b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatUtils.java
new file mode 100644
index 000000000..607d07836
--- /dev/null
+++ b/format-xls/src/main/java/io/cdap/plugin/format/xls/input/XlsInputFormatUtils.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2023 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.format.xls.input;
+
+import io.cdap.cdap.api.data.schema.Schema;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellType;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Utilities around XLS input format.
+ */
+public class XlsInputFormatUtils {
+  private static final Pattern NOT_VALID_PATTERN = Pattern.compile("[^A-Za-z0-9_]+");
+
+  /**
+   * Cleans a list of column names to make sure they comply with avro field naming standard.
+   * It also makes sure each name is unique in the list.
+   * Field names can start with [A-Za-z_] and subsequently contain only [A-Za-z0-9_].
+   * <p>
+   * Steps:
+   * 1) Trim surrounding spaces
+   * 2) If its empty replace it with BLANK
+   * 3) If it starts with a number, prepend "col_"
+   * 4) Replace invalid characters with "_" (multiple invalid characters gets replaced with one symbol)
+   * 5) Check if the name has been found before (without considering case)
+   * if so add _# where # is the number of times seen before + 1
+   */
+  public static List<String> getSafeColumnNames(List<String> columnNames) {
+    return cleanSchemaColumnNames(columnNames);
+  }
+
+  private static List<String> cleanSchemaColumnNames(List<String> columnNames) {
+    final String replacementChar = "_";
+    final List<String> cleanColumnNames = new ArrayList<>();
+    final Map<String, Integer> seenColumnNames = new HashMap<>();
+    for (String columnName : columnNames) {
+      StringBuilder cleanColumnNameBuilder = new StringBuilder();
+
+      // Remove any spaces at the end of the strings
+      columnName = columnName.trim();
+
+      // If it's an empty string replace it with BLANK
+      if (columnName.isEmpty()) {
+        cleanColumnNameBuilder.append("BLANK");
+      } else if ((columnName.charAt(0) >= '0') && (columnName.charAt(0) <= '9')) {
+        // Prepend a col_ if the first character is a number
+        cleanColumnNameBuilder.append("col_");
+      }
+
+      // Replace all invalid characters with the replacement char
+      cleanColumnNameBuilder.append(NOT_VALID_PATTERN.matcher(columnName).replaceAll(replacementChar));
+
+      // Check if the field exist if so append and index at the end
+      // We use lowercase to match columns "A" and "a" to avoid issues with wrangler.
+      String cleanColumnName = cleanColumnNameBuilder.toString();
+      String lowerCaseCleanColumnName = cleanColumnName.toLowerCase();
+      while (seenColumnNames.containsKey(lowerCaseCleanColumnName)) {
+        cleanColumnNameBuilder.append(replacementChar).append(seenColumnNames.get(lowerCaseCleanColumnName));
+        seenColumnNames.put(lowerCaseCleanColumnName, seenColumnNames.get(lowerCaseCleanColumnName) + 1);
+        cleanColumnName = cleanColumnNameBuilder.toString();
+        lowerCaseCleanColumnName = cleanColumnName.toLowerCase();
+      }
+      seenColumnNames.put(lowerCaseCleanColumnName, 2);
+
+      cleanColumnNames.add(cleanColumnName);
+    }
+    return cleanColumnNames;
+  }
+}
diff --git a/pom.xml b/pom.xml
index 63c00f4b7..b375c18fb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -45,6 +45,7 @@
     <module>solrsearch-plugins</module>
     <module>spark-plugins</module>
     <module>transform-plugins</module>
+    <module>format-xls</module>
   </modules>
 
   <licenses>