Skip to content

Commit

Permalink
feat: Update csv cleaner (#8828)
Browse files Browse the repository at this point in the history
* More refactoring

* Add more new options and more tests

* Improve docstrings

* Add release notes

* Fix pylint
  • Loading branch information
sjrl authored Feb 7, 2025
1 parent 1785ea6 commit 35788a2
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 35 deletions.
132 changes: 97 additions & 35 deletions haystack/components/preprocessors/csv_document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from io import StringIO
from typing import Dict, List
from typing import Dict, List, Optional

from haystack import Document, component, logging
from haystack.lazy_imports import LazyImport
Expand All @@ -21,21 +22,36 @@ class CSVDocumentCleaner:
This component processes CSV content stored in Documents, allowing
for the optional ignoring of a specified number of rows and columns before performing
the cleaning operation.
the cleaning operation. Additionally, it provides options to keep document IDs and
control whether empty rows and columns should be removed.
"""

def __init__(self, ignore_rows: int = 0, ignore_columns: int = 0) -> None:
def __init__(
self,
*,
ignore_rows: int = 0,
ignore_columns: int = 0,
remove_empty_rows: bool = True,
remove_empty_columns: bool = True,
keep_id: bool = False,
) -> None:
"""
Initializes the CSVDocumentCleaner component.
:param ignore_rows: Number of rows to ignore from the top of the CSV table before processing.
:param ignore_columns: Number of columns to ignore from the left of the CSV table before processing.
:param remove_empty_rows: Whether to remove rows that are entirely empty.
:param remove_empty_columns: Whether to remove columns that are entirely empty.
:param keep_id: Whether to retain the original document ID in the output document.
Rows and columns ignored using these parameters are preserved in the final output, meaning
they are not considered when removing empty rows and columns.
"""
self.ignore_rows = ignore_rows
self.ignore_columns = ignore_columns
self.remove_empty_rows = remove_empty_rows
self.remove_empty_columns = remove_empty_columns
self.keep_id = keep_id
pandas_import.check()

@component.output_types(documents=List[Document])
Expand All @@ -44,14 +60,20 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
Cleans CSV documents by removing empty rows and columns while preserving specified ignored rows and columns.
:param documents: List of Documents containing CSV-formatted content.
:return: A dictionary with a list of cleaned Documents under the key "documents".
Processing steps:
1. Reads each document's content as a CSV table.
2. Retains the specified number of `ignore_rows` from the top and `ignore_columns` from the left.
3. Drops any rows and columns that are entirely empty (all NaN values).
3. Drops any rows and columns that are entirely empty (if enabled by `remove_empty_rows` and
`remove_empty_columns`).
4. Reattaches the ignored rows and columns to maintain their original positions.
5. Returns the cleaned CSV content as a new `Document` object.
5. Returns the cleaned CSV content as a new `Document` object, with an option to retain the original
document ID.
"""
if len(documents) == 0:
return {"documents": []}

ignore_rows = self.ignore_rows
ignore_columns = self.ignore_columns

Expand Down Expand Up @@ -82,35 +104,75 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
cleaned_documents.append(document)
continue

# Save ignored rows
ignored_rows = None
if ignore_rows > 0:
ignored_rows = df.iloc[:ignore_rows, :]

# Save ignored columns
ignored_columns = None
if ignore_columns > 0:
ignored_columns = df.iloc[:, :ignore_columns]

# Drop rows and columns that are entirely empty
remaining_df = df.iloc[ignore_rows:, ignore_columns:]
final_df = remaining_df.dropna(axis=0, how="all").dropna(axis=1, how="all")

# Reattach ignored rows
if ignore_rows > 0 and ignored_rows is not None:
# Keep only relevant columns
ignored_rows = ignored_rows.loc[:, final_df.columns]
final_df = pd.concat([ignored_rows, final_df], axis=0)

# Reattach ignored columns
if ignore_columns > 0 and ignored_columns is not None:
# Keep only relevant rows
ignored_columns = ignored_columns.loc[final_df.index, :]
final_df = pd.concat([ignored_columns, final_df], axis=1)

cleaned_documents.append(
Document(
content=final_df.to_csv(index=False, header=False, lineterminator="\n"), meta=document.meta.copy()
)
final_df = self._clean_df(df=df, ignore_rows=ignore_rows, ignore_columns=ignore_columns)

clean_doc = Document(
id=document.id if self.keep_id else "",
content=final_df.to_csv(index=False, header=False, lineterminator="\n"),
blob=document.blob,
meta=deepcopy(document.meta),
score=document.score,
embedding=document.embedding,
sparse_embedding=document.sparse_embedding,
)
cleaned_documents.append(clean_doc)
return {"documents": cleaned_documents}

def _clean_df(self, df: "pd.DataFrame", ignore_rows: int, ignore_columns: int) -> "pd.DataFrame":
"""
Cleans a DataFrame by removing empty rows and columns while preserving ignored sections.
:param df: The input DataFrame representing the CSV data.
:param ignore_rows: Number of top rows to ignore.
:param ignore_columns: Number of left columns to ignore.
"""
# Get ignored rows and columns
ignored_rows = self._get_ignored_rows(df=df, ignore_rows=ignore_rows)
ignored_columns = self._get_ignored_columns(df=df, ignore_columns=ignore_columns)
final_df = df.iloc[ignore_rows:, ignore_columns:]

# Drop rows that are entirely empty
if self.remove_empty_rows:
final_df = final_df.dropna(axis=0, how="all")

# Drop columns that are entirely empty
if self.remove_empty_columns:
final_df = final_df.dropna(axis=1, how="all")

# Reattach ignored rows
if ignore_rows > 0 and ignored_rows is not None:
# Keep only relevant columns
ignored_rows = ignored_rows.loc[:, final_df.columns]
final_df = pd.concat([ignored_rows, final_df], axis=0)

# Reattach ignored columns
if ignore_columns > 0 and ignored_columns is not None:
# Keep only relevant rows
ignored_columns = ignored_columns.loc[final_df.index, :]
final_df = pd.concat([ignored_columns, final_df], axis=1)

return final_df

@staticmethod
def _get_ignored_rows(df: "pd.DataFrame", ignore_rows: int) -> Optional["pd.DataFrame"]:
"""
Extracts the rows to be ignored from the DataFrame.
:param df: The input DataFrame.
:param ignore_rows: Number of rows to extract from the top.
"""
if ignore_rows > 0:
return df.iloc[:ignore_rows, :]
return None

@staticmethod
def _get_ignored_columns(df: "pd.DataFrame", ignore_columns: int) -> Optional["pd.DataFrame"]:
"""
Extracts the columns to be ignored from the DataFrame.
:param df: The input DataFrame.
:param ignore_columns: Number of columns to extract from the left.
"""
if ignore_columns > 0:
return df.iloc[:, :ignore_columns]
return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
enhancements:
- |
For the `CSVDocumentCleaner`, added the `remove_empty_rows` and `remove_empty_columns` parameters to control whether entirely empty rows and columns are removed.
Also added the `keep_id` parameter to optionally retain the original document ID in the output document.
75 changes: 75 additions & 0 deletions test/components/preprocessors/test_csv_document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,78 @@ def test_zero_ignore_rows_and_columns() -> None:
result = csv_document_cleaner.run([csv_document])
cleaned_document = result["documents"][0]
assert cleaned_document.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"


def test_empty_document() -> None:
    """A document with empty content yields a document with empty content and empty meta."""
    blank_doc = Document(content="")
    cleaner = CSVDocumentCleaner()
    first_output = cleaner.run([blank_doc])["documents"][0]
    assert first_output.content == ""
    assert first_output.meta == {}


def test_empty_documents() -> None:
    """Running the cleaner on an empty list yields an empty list of documents."""
    cleaner = CSVDocumentCleaner()
    output = cleaner.run([])
    assert output["documents"] == []


def test_keep_id() -> None:
    """With `keep_id=True` the cleaned document carries the ID of the input document."""
    source_doc = Document(id="123", content=",A,B,C\n1,item,s,\n")
    cleaner = CSVDocumentCleaner(keep_id=True)
    first_output = cleaner.run([source_doc])["documents"][0]
    assert first_output.id == "123"
    assert first_output.content == ",A,B,C\n1,item,s,\n"


def test_id_not_none() -> None:
    """With default settings the cleaned document still has a non-empty ID."""
    source_doc = Document(content=",A,B,C\n1,item,s,\n")
    cleaner = CSVDocumentCleaner()
    first_output = cleaner.run([source_doc])["documents"][0]
    assert first_output.id != ""
    assert first_output.content == ",A,B,C\n1,item,s,\n"


def test_remove_empty_rows_false() -> None:
    """With `remove_empty_rows=False` the empty row is kept while the empty column is still dropped."""
    source_doc = Document(content=",B,C\n,,\n,5,6\n")
    cleaner = CSVDocumentCleaner(remove_empty_rows=False)
    first_output = cleaner.run([source_doc])["documents"][0]
    assert first_output.content == "B,C\n,\n5,6\n"


def test_remove_empty_columns_false() -> None:
    """With `remove_empty_columns=False` the empty column is kept while the empty row is still dropped."""
    source_doc = Document(content=",B,C\n,,\n,,4\n")
    cleaner = CSVDocumentCleaner(remove_empty_columns=False)
    first_output = cleaner.run([source_doc])["documents"][0]
    assert first_output.content == ",B,C\n,,4\n"


def test_remove_empty_rows_and_columns_false() -> None:
    """With both removal options disabled the CSV content comes back unchanged."""
    source_doc = Document(content=",B,C\n,,4\n,,\n")
    cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
    first_output = cleaner.run([source_doc])["documents"][0]
    assert first_output.content == ",B,C\n,,4\n,,\n"

0 comments on commit 35788a2

Please sign in to comment.