From 65d04a263ab81e0ffdc7ba75bbe466d0644e2a85 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Wed, 1 Jun 2022 13:49:12 +0545 Subject: [PATCH 01/10] Added command summary * Added command named summary * Added to_summary function to ReportTask * Added tests for both 'summary' command and 'to_summary' function --- data/fixtures/program/summary/errors.txt | 15 +++ frictionless/helpers.py | 52 ++++++++ frictionless/program/__init__.py | 1 + frictionless/program/main.py | 13 +- frictionless/program/summary.py | 119 ++++++++++++++++++ frictionless/program/validate.py | 60 +-------- frictionless/report/report.py | 80 +++++++++++- tests/program/test_summary.py | 153 +++++++++++++++++++++++ tests/report/test_reporttask.py | 81 ++++++++++++ 9 files changed, 514 insertions(+), 60 deletions(-) create mode 100644 data/fixtures/program/summary/errors.txt create mode 100644 frictionless/program/summary.py create mode 100644 tests/program/test_summary.py create mode 100644 tests/report/test_reporttask.py diff --git a/data/fixtures/program/summary/errors.txt b/data/fixtures/program/summary/errors.txt new file mode 100644 index 0000000000..c6f3486dbe --- /dev/null +++ b/data/fixtures/program/summary/errors.txt @@ -0,0 +1,15 @@ ++-------+---------+------------+----------------------------------------------------+ +| row | field | code | message | ++=======+=========+============+====================================================+ +| 4 | 5 | extra-cell | Row at position "4" has an extra value in field at | +| | | | position "5" | ++-------+---------+------------+----------------------------------------------------+ +| 7 | 2 | missing- | Row at position "7" has a missing cell in field | +| | | cell | "neighbor_id" at position "2" | ++-------+---------+------------+----------------------------------------------------+ +| 7 | 3 | missing- | Row at position "7" has a missing cell in field | +| | | cell | "name" at position "3" | ++-------+---------+------------+----------------------------------------------------+ +| 7 | 4 | missing- | Row at position "7" has a missing cell in field | +| | | cell | "population" at position "4" | ++-------+---------+------------+----------------------------------------------------+ \ No newline at end of file diff --git a/frictionless/helpers.py b/frictionless/helpers.py index 8686b11e73..d80a4ca145 100644 --- a/frictionless/helpers.py +++ b/frictionless/helpers.py @@ -634,6 +634,29 @@ def dicts_to_markdown_table(dicts: List[dict], **kwargs) -> str: return df.where(df.notnull(), None).to_markdown(index=False) +# TODO:This is a temporary function to use with tabulate as +# tabulate 0.8.9 does not support text wrap +def wrap_text_to_colwidths(list_of_lists: List, colwidths: List = [5, 5, 10, 50]) -> List: + """Create new list with wrapped text with different column width. + Args: + list_of_lists (List): List of lines + colwidths (List): width for each column + + Returns: + List: list of lines with wrapped text + + """ + result = [] + for row in list_of_lists: + new_row = [] + for cell, width in zip(row, colwidths): + cell = str(cell) + wrapped = textwrap.wrap(cell, width=width) + new_row.append("\n".join(wrapped)) + result.append(new_row) + return result + + def format_bytes(size: int) -> str: """Format bytes to larger units""" units = ["bytes", "KB", "MB", "GB", "TB"] @@ -641,3 +664,32 @@ def format_bytes(size: int) -> str: if index > len(units): index = len(units) - 1 return units[index] + + +def validation_summary( + source: str, + time_taken: str, + basepath: str = None, + rows_checked: int = None, + error_list: List = None, +) -> List: + """Generate summary for validation task""" + file_path = os.path.join(basepath, source) if basepath else source + file_size = "N/A" + unit = None + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + unit = format_bytes(file_size) + content = [ + [f"File name { '' if unit else '(Not Found)' }", source], + [f"File size { f'({unit})' if unit else '' }", file_size], + ["Total Time Taken (sec)", time_taken], + ] + if rows_checked: + content.append(["Rows Checked(Partial)**", rows_checked]) + if error_list: + content.append(["Total Errors", sum(error_list.values())]) + for code, count in error_list.items(): + content.append([code, count]) + + return content diff --git a/frictionless/program/__init__.py b/frictionless/program/__init__.py index 82594aded8..eef0a9d3f8 100644 --- a/frictionless/program/__init__.py +++ b/frictionless/program/__init__.py @@ -1,6 +1,7 @@ from .api import program_api from .describe import program_describe from .extract import program_extract +from .summary import program_summary from .main import program, program_main from .transform import program_transform from .validate import program_validate diff --git a/frictionless/program/main.py b/frictionless/program/main.py index f309434e73..7fcf1b39f7 100644 --- a/frictionless/program/main.py +++ b/frictionless/program/main.py @@ -1,3 +1,4 @@ +import sys import typer from typing import Optional from .. import settings @@ -5,7 +6,17 @@ # Program -program = typer.Typer() + +# TODO: remove this hack when Typer supports not-found commands catching +# https://github.com/tiangolo/typer/issues/18 +class Program(typer.Typer): + def __call__(self, *args, **kwargs): + if sys.argv[1].count("."): + sys.argv = [sys.argv[0], "summary", sys.argv[1]] + return super().__call__(*args, **kwargs) + + +program = Program() # Helpers diff --git a/frictionless/program/summary.py b/frictionless/program/summary.py new file mode 100644 index 0000000000..60c8a7399d --- /dev/null +++ b/frictionless/program/summary.py @@ -0,0 +1,119 @@ +import typer +from tabulate import tabulate +from .main import program +from . import common +from .. import helpers +from ..layout import Layout +from ..resource import Resource + + +@program.command(name="summary") +def program_summary(source: str = common.source): + """Summary of data source. + + It will return schema, sample of the data and validation report for the resource. + """ + # Validate input + if not source: + message = 'Providing "source" is required' + typer.secho(message, err=True, fg=typer.colors.RED, bold=True) + raise typer.Exit(1) + # Infer Resource + try: + resource = Resource(source, layout=Layout(limit_rows=5)) + resource.infer() + except Exception as exception: + typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) + raise typer.Exit(1) + # Describe data + content = [ + [field.name, field.type, True if field.required else ""] + for field in resource.schema.fields + ] + typer.secho("") + typer.secho("# Describe ", bold=True) + typer.secho("") + typer.secho(tabulate(content, headers=["name", "type", "required"], tablefmt="grid")) + # Extract data + try: + resource.extract() + except Exception as exception: + typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) + raise typer.Exit(1) + typer.secho("") + typer.secho("# Extract ", bold=True) + typer.secho("") + typer.secho(resource.to_view()) + # Validate data + try: + report = resource.validate() + except Exception as exception: + typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) + raise typer.Exit(1) + error_content = [] + error_list = {} + typer.secho("") + typer.secho("# Validate ", bold=True) + typer.secho("") + for task in report.tasks: + tabular = task.resource.profile == "tabular-data-resource" + prefix = "valid" if task.valid else "invalid" + suffix = "" if tabular else "(non-tabular)" + source = task.resource.path or task.resource.name + # for zipped resources append file name + if task.resource.innerpath: + source = f"{source} => {task.resource.innerpath}" + typer.secho(f"# {'-'*len(prefix)}", bold=True) + typer.secho(f"# {prefix}: {source} {suffix}", bold=True) + typer.secho(f"# {'-'*len(prefix)}", bold=True) + for error in report.tasks[0].errors: + error_content.append( + [ + error.get("rowPosition", ""), + error.get("fieldPosition", ""), + error.code, + error.message, + ] + ) + # error list for summary + error_title = f"{error.name} ({error.code})" + if error_title not in error_list: + error_list[error_title] = 0 + error_list[error_title] += 1 + if task.partial: + last_row_checked = error.get("rowPosition", "") + error_content = helpers.wrap_text_to_colwidths(error_content) + rows_checked = last_row_checked if task.partial else None + summary_content = helpers.validation_summary( + source, + basepath=task.resource.basepath, + time_taken=task.time, + rows_checked=rows_checked, + error_list=error_list, + ) + typer.secho("") + typer.secho("## Summary ", bold=True) + typer.secho("") + typer.secho( + str( + tabulate( + summary_content, + headers=["Description", "Size/Name/Count"], + tablefmt="grid", + ) + ) + ) + if len(error_content) > 0: + typer.secho("") + typer.secho("## Errors ", bold=True) + typer.secho("") + typer.secho( + tabulate( + error_content, + headers=["row", "field", "code", "message"], + tablefmt="grid", + ) + ) + + # Return retcode + raise typer.Exit(code=int(not report.valid)) diff --git a/frictionless/program/validate.py b/frictionless/program/validate.py index e94751c55f..1f7f5252da 100644 --- a/frictionless/program/validate.py +++ b/frictionless/program/validate.py @@ -1,7 +1,5 @@ import sys -import os import typer -import textwrap from typing import List from tabulate import tabulate from ..actions import validate @@ -270,10 +268,10 @@ def program_validate( error_list[error_title] += 1 if task.partial: last_row_checked = error.get("rowPosition", "") - content = _wrap_text_to_colwidths(content) + content = helpers.wrap_text_to_colwidths(content) # summary rows_checked = last_row_checked if task.partial else None - summary_content = _validation_summary( + summary_content = helpers.validation_summary( source, basepath=task.resource.basepath, time_taken=task.time, @@ -311,57 +309,3 @@ def program_validate( # Return retcode raise typer.Exit(code=int(not report.valid)) - - -# TODO:This is a temporary function to use with tabulate as -# tabulate 0.8.9 does not support text wrap -def _wrap_text_to_colwidths( - list_of_lists: List, colwidths: List = [5, 5, 10, 50] -) -> List: - """Create new list with wrapped text with different column width. - Args: - list_of_lists (List): List of lines - colwidths (List): width for each column - - Returns: - List: list of lines with wrapped text - - """ - result = [] - for row in list_of_lists: - new_row = [] - for cell, width in zip(row, colwidths): - cell = str(cell) - wrapped = textwrap.wrap(cell, width=width) - new_row.append("\n".join(wrapped)) - result.append(new_row) - return result - - -def _validation_summary( - source: str, - time_taken: str, - basepath: str = None, - rows_checked: int = None, - error_list: List = None, -) -> List: - """Generate summary for validation task""" - file_path = os.path.join(basepath, source) if basepath else source - file_size = "N/A" - unit = None - if os.path.exists(file_path): - file_size = os.path.getsize(file_path) - unit = helpers.format_bytes(file_size) - content = [ - [f"File name { '' if unit else '(Not Found)' }", source], - [f"File size { f'({unit})' if unit else '' }", file_size], - ["Total Time Taken (sec)", time_taken], - ] - if rows_checked: - content.append(["Rows Checked(Partial)**", rows_checked]) - if error_list: - content.append(["Total Errors", sum(error_list.values())]) - for code, count in error_list.items(): - content.append([code, count]) - - return content diff --git a/frictionless/report/report.py b/frictionless/report/report.py index a38680ff0d..54c73967d9 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -1,6 +1,8 @@ import functools from copy import deepcopy from importlib import import_module +from tabulate import tabulate +from ..layout import Layout from ..metadata import Metadata from ..errors import Error, TaskError, ReportError from ..exception import FrictionlessException @@ -222,7 +224,7 @@ def __init__( time=None, scope=None, partial=None, - errors=None + errors=None, ): # Store provided @@ -331,6 +333,82 @@ def flatten(self, spec=["rowPosition", "fieldPosition", "code"]): result.append([context.get(prop) for prop in spec]) return result + # Summary + + def to_summary(self) -> dict: + """Summary of the resource + + Raises: + FrictionlessException: on any error + """ + # Process errors + summary = {} + error_list = {} + error_content = [] + for error in self.errors: + if error.code == "scheme-error": + return error + error_content.append( + [ + error.get("rowPosition", ""), + error.get("fieldPosition", ""), + error.code, + error.message, + ] + ) + # error list for summary + error_title = f"{error.name} ({error.code})" + if error_title not in error_list: + error_list[error_title] = 0 + error_list[error_title] += 1 + if self.partial: + last_row_checked = error.get("rowPosition", "") + # Describe + try: + self.resource.infer() + except Exception as exception: + raise FrictionlessException(self.__Error(note=str(exception))) from exception + summary["describe"] = tabulate( + [ + [field.name, field.type, True if field.required else ""] + for field in self.resource.schema.fields + ], + headers=["name", "type", "required"], + tablefmt="grid", + ) + # Extract + # Copy of existing resource to reset the properties to only extract 5 rows + resource = self.resource.to_copy(layout=Layout(limit_rows=5)) + try: + resource.extract() + except Exception as exception: + raise FrictionlessException(self.__Error(note=str(exception))) from exception + summary["extract"] = resource.to_view() + # Validate + summary["validate"] = {} + error_content = helpers.wrap_text_to_colwidths(error_content) + rows_checked = last_row_checked if self.partial else None + summary_content = helpers.validation_summary( + self.resource.path, + basepath=self.resource.basepath, + time_taken=self.time, + rows_checked=rows_checked, + error_list=error_list, + ) + summary["validate"]["summary"] = tabulate( + summary_content, + headers=["Description", "Size/Name/Count"], + tablefmt="grid", + ) + if len(error_content) > 0: + summary["validate"]["errors"] = tabulate( + error_content, + headers=["row", "field", "code", "message"], + tablefmt="grid", + ) + + return summary + # Metadata metadata_Error = ReportError diff --git a/tests/program/test_summary.py b/tests/program/test_summary.py new file mode 100644 index 0000000000..719e137eb7 --- /dev/null +++ b/tests/program/test_summary.py @@ -0,0 +1,153 @@ +import os +from typer.testing import CliRunner +from frictionless import program + +runner = CliRunner() + + +def test_program_error_not_found(): + result = runner.invoke(program, "summary data/countriess.csv") + assert result.exit_code == 1 + assert ( + result.stdout.count("[scheme-error]") + and result.stdout.count("[Errno 2]") + and result.stdout.count("data/countriess.csv") + ) + + +def test_program_summary(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert ( + result.stdout.count("invalid") + and result.stdout.count("Describe") + and result.stdout.count("Extract") + and result.stdout.count("Validate") + and result.stdout.count("Summary") + and result.stdout.count("Errors") + ) + + +def test_program_summary_valid(): + result = runner.invoke(program, "summary data/capital-valid.csv") + assert result.exit_code == 0 + assert ( + result.stdout.count("valid") + and result.stdout.count("Describe") + and result.stdout.count("Extract") + and result.stdout.count("Validate") + and result.stdout.count("Summary") + and not result.stdout.count("Errors") + ) + + +def test_program_summary_describe_header_row(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert result.stdout.count("| name | type | required |") + + +def test_program_summary_describe(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert ( + result.stdout.count("| id | integer | |") + and result.stdout.count("| neighbor_id | string | |") + and result.stdout.count("| name | string | |") + and result.stdout.count("| population | string | |") + ) + + +def test_program_summary_extract_header_row(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert result.stdout.count("| id | neighbor_id | name | population |") + + +def test_program_summary_extract(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert ( + result.stdout.count("| 1 | 'Ireland' | 'Britain' | '67' |") + and result.stdout.count("| 2 | '3' | 'France' | 'n/a' |") + and result.stdout.count("| 3 | '22' | 'Germany' | '83' |") + and result.stdout.count("| 4 | None | 'Italy' | '60' |") + and result.stdout.count("| 5 | None | None | None |") + ) + + +def test_program_summary_extract_only_5_rows(): + result = runner.invoke(program, "summary data/long.csv") + assert result.exit_code == 0 + assert ( + result.stdout.count("valid") + and result.stdout.count("| 1 | 'a' |") + and result.stdout.count("| 2 | 'b' |") + and result.stdout.count("| 3 | 'c' |") + and result.stdout.count("| 4 | 'd' |") + and result.stdout.count("| 5 | 'e' |") + and not result.stdout.count("| 6 | 'f' |") + ) + + +def test_program_summary_validate(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert result.stdout.count("# invalid:") + + +def test_program_summary_validate_summary_header_row(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert result.stdout.count("Description | Size/Name/Count") + + +def test_program_summary_validate_summary(): + result = runner.invoke(program, "summary data/countries.csv") + assert result.exit_code == 1 + assert ( + result.stdout.count("File name |") + and result.stdout.count("File size (bytes) | 143") + and result.stdout.count("Total Time Taken (sec) |") + and result.stdout.count("Total Errors | 4") + and result.stdout.count("Extra Cell (extra-cell) | 1") + and result.stdout.count("Missing Cell (missing-cell) | 3") + ) + + +def test_program_summary_validate_errors(): + result = runner.invoke(program, "summary data/countries.csv") + output_file_path = "data/fixtures/program/summary/errors.txt" + with open(output_file_path, encoding="utf-8") as file: + expected = file.read() + assert result.exit_code == 1 + assert result.stdout.count(expected.strip()) + + +def test_program_summary_without_command(tmpdir): + output_file_path = f"{tmpdir}/output.txt" + exit_code = os.system(f"frictionless data/countries.csv > {output_file_path}") + # A value of 256 means the spawned program terminated with exit code 1 + # https://stackoverflow.com/questions/47832180/os-system-returns-the-value-256-when-run-from-crontab + assert exit_code == 256 + with open(output_file_path, encoding="utf-8") as file: + expected = file.read() + assert ( + expected.count("Describe") + and expected.count("Extract") + and expected.count("Validate") + and expected.count("Summary") + and expected.count("Errors") + ) + + +def test_program_summary_without_filepath(): + result = runner.invoke(program, "summary") + assert result.exit_code == 1 + assert result.stdout.strip() == 'Providing "source" is required' + + +def test_program_summary_zipped_innerpath(): + result = runner.invoke(program, "summary data/table.csv.zip") + assert result.exit_code == 0 + assert result.stdout.count("table.csv.zip => table.csv") diff --git a/tests/report/test_reporttask.py b/tests/report/test_reporttask.py new file mode 100644 index 0000000000..e42cc3d5c5 --- /dev/null +++ b/tests/report/test_reporttask.py @@ -0,0 +1,81 @@ +from frictionless import validate + + +def test_report_task_error_not_found(): + report = validate("data/countriess.csv") + assert ( + report.tasks[0].errors[0].code == "scheme-error" + and report.tasks[0].errors[0].description + == "Data reading error because of incorrect scheme." + ) + + +def test_report_task_summary(): + report = validate("data/countries.csv") + assert report.valid == False and ["describe", "extract", "validate"] == list( + report.tasks[0].to_summary().keys() + ) + + +def test_report_task_summary_valid(): + report = validate("data/capital-valid.csv") + assert report.valid == True and ["summary"] == list( + report.tasks[0].to_summary()["validate"].keys() + ) + + +def test_report_task_summary_describe(): + report = validate("data/countries.csv") + output = report.tasks[0].to_summary()["describe"] + assert ( + output.count("| id | integer | |") + and output.count("| neighbor_id | string | |") + and output.count("| name | string | |") + and output.count("| population | string | |") + ) + + +def test_report_task_summary_extract(): + report = validate("data/countries.csv") + output = report.tasks[0].to_summary()["extract"] + assert ( + output.count("| 1 | 'Ireland' | 'Britain' | '67' |") + and output.count("| 2 | '3' | 'France' | 'n/a' |") + and output.count("| 3 | '22' | 'Germany' | '83' |") + and output.count("| 4 | None | 'Italy' | '60' |") + and output.count("| 5 | None | None | None |") + ) + + +def test_report_task_summary_extract_only_5_rows(): + report = validate("data/long.csv") + output = report.tasks[0].to_summary()["extract"] + assert ( + output.count("| 1 | 'a' |") + and output.count("| 2 | 'b' |") + and output.count("| 3 | 'c' |") + and output.count("| 4 | 'd' |") + and output.count("| 5 | 'e' |") + and not output.count("| 6 | 'f' |") + ) + + +def test_report_task_summary_validate_summary(): + report = validate("data/countries.csv") + validate_summary = report.tasks[0].to_summary()["validate"]["summary"] + assert ( + validate_summary.count("File name |") + and validate_summary.count("File size (bytes) | 143") + and validate_summary.count("Total Time Taken (sec) |") + and validate_summary.count("Total Errors | 4") + and validate_summary.count("Extra Cell (extra-cell) | 1") + and validate_summary.count("Missing Cell (missing-cell) | 3") + ) + + +def test_report_task_summary_validate_errors(): + report = validate("data/countries.csv") + output_file_path = "data/fixtures/program/summary/errors.txt" + with open(output_file_path, encoding="utf-8") as file: + expected = file.read() + assert report.tasks[0].to_summary()["validate"]["errors"] == expected.strip() From cde50b5661ee0fa53a44a8fa3aef7796f1e24508 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Tue, 7 Jun 2022 13:16:32 +0545 Subject: [PATCH 02/10] fixes for failing tests --- tests/report/test_reporttask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/report/test_reporttask.py b/tests/report/test_reporttask.py index e42cc3d5c5..344377236f 100644 --- a/tests/report/test_reporttask.py +++ b/tests/report/test_reporttask.py @@ -12,14 +12,14 @@ def test_report_task_error_not_found(): def test_report_task_summary(): report = validate("data/countries.csv") - assert report.valid == False and ["describe", "extract", "validate"] == list( + assert report.valid is False and ["describe", "extract", "validate"] == list( report.tasks[0].to_summary().keys() ) def test_report_task_summary_valid(): report = validate("data/capital-valid.csv") - assert report.valid == True and ["summary"] == list( + assert report.valid is True and ["summary"] == list( report.tasks[0].to_summary()["validate"].keys() ) From 25e6ee4a98a06f11b83965c6787c0637efb83bef Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Tue, 7 Jun 2022 17:57:00 +0545 Subject: [PATCH 03/10] Revised code * Rearranged code and placed it in the relevant places "separation of concerns" * Made changes to tests and added new tests * Revised summary command code --- frictionless/program/summary.py | 84 +------------- frictionless/report/report.py | 146 +++++++++++------------- frictionless/schema/schema.py | 10 ++ tests/program/test_summary.py | 30 ++--- tests/report/test_report.py | 36 ++++++ tests/report/test_reporttask.py | 81 ------------- tests/resource/describe/test_general.py | 13 +++ tests/resource/validate/test_general.py | 39 +++++++ 8 files changed, 184 insertions(+), 255 deletions(-) create mode 100644 tests/report/test_report.py delete mode 100644 tests/report/test_reporttask.py diff --git a/frictionless/program/summary.py b/frictionless/program/summary.py index 60c8a7399d..5f636edd13 100644 --- a/frictionless/program/summary.py +++ b/frictionless/program/summary.py @@ -1,9 +1,6 @@ import typer -from tabulate import tabulate from .main import program from . import common -from .. import helpers -from ..layout import Layout from ..resource import Resource @@ -20,100 +17,29 @@ def program_summary(source: str = common.source): raise typer.Exit(1) # Infer Resource try: - resource = Resource(source, layout=Layout(limit_rows=5)) + resource = Resource(source) resource.infer() except Exception as exception: typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) raise typer.Exit(1) - # Describe data - content = [ - [field.name, field.type, True if field.required else ""] - for field in resource.schema.fields - ] typer.secho("") typer.secho("# Describe ", bold=True) typer.secho("") - typer.secho(tabulate(content, headers=["name", "type", "required"], tablefmt="grid")) - # Extract data - try: - resource.extract() - except Exception as exception: - typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) - raise typer.Exit(1) + typer.secho(str(resource.schema.to_summary())) typer.secho("") typer.secho("# Extract ", bold=True) typer.secho("") - typer.secho(resource.to_view()) - # Validate data + typer.secho(str(resource.to_view())) + # Validate try: report = resource.validate() except Exception as exception: typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True) raise typer.Exit(1) - error_content = [] - error_list = {} typer.secho("") typer.secho("# Validate ", bold=True) typer.secho("") - for task in report.tasks: - tabular = task.resource.profile == "tabular-data-resource" - prefix = "valid" if task.valid else "invalid" - suffix = "" if tabular else "(non-tabular)" - source = task.resource.path or task.resource.name - # for zipped resources append file name - if task.resource.innerpath: - source = f"{source} => {task.resource.innerpath}" - typer.secho(f"# {'-'*len(prefix)}", bold=True) - typer.secho(f"# {prefix}: {source} {suffix}", bold=True) - typer.secho(f"# {'-'*len(prefix)}", bold=True) - for error in report.tasks[0].errors: - error_content.append( - [ - error.get("rowPosition", ""), - error.get("fieldPosition", ""), - error.code, - error.message, - ] - ) - # error list for summary - error_title = f"{error.name} ({error.code})" - if error_title not in error_list: - error_list[error_title] = 0 - error_list[error_title] += 1 - if task.partial: - last_row_checked = error.get("rowPosition", "") - error_content = helpers.wrap_text_to_colwidths(error_content) - rows_checked = last_row_checked if task.partial else None - summary_content = helpers.validation_summary( - source, - basepath=task.resource.basepath, - time_taken=task.time, - rows_checked=rows_checked, - error_list=error_list, - ) - typer.secho("") - typer.secho("## Summary ", bold=True) - typer.secho("") - typer.secho( - str( - tabulate( - summary_content, - headers=["Description", "Size/Name/Count"], - tablefmt="grid", - ) - ) - ) - if len(error_content) > 0: - typer.secho("") - typer.secho("## Errors ", bold=True) - typer.secho("") - typer.secho( - tabulate( - error_content, - headers=["row", "field", "code", "message"], - tablefmt="grid", - ) - ) + typer.secho(str(report.to_summary())) # Return retcode raise typer.Exit(code=int(not report.valid)) diff --git a/frictionless/report/report.py b/frictionless/report/report.py index 54c73967d9..2131011dc3 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -2,7 +2,6 @@ from copy import deepcopy from importlib import import_module from tabulate import tabulate -from ..layout import Layout from ..metadata import Metadata from ..errors import Error, TaskError, ReportError from ..exception import FrictionlessException @@ -168,6 +167,75 @@ def wrapper(*args, **kwargs): return wrapper + # Summary + + def to_summary(self): + validation_content = None + for task in self.tasks: + tabular = task.resource.profile == "tabular-data-resource" + prefix = "valid" if task.valid else "invalid" + suffix = "" if tabular else "(non-tabular)" + source = task.resource.path or task.resource.name + # for zipped resources append file name + if task.resource.innerpath: + source = f"{source} => {task.resource.innerpath}" + validation_content = f"\n# {'-'*len(prefix)}\n" + validation_content += f"\n# {prefix}: {source} {suffix}\n" + validation_content += f"\n# {'-'*len(prefix)}\n" + error_list = {} + error_content = [] + for error in task.errors: + if error.code == "scheme-error": + return error + error_content.append( + [ + error.get("rowPosition", ""), + error.get("fieldPosition", ""), + error.code, + error.message, + ] + ) + # error list for summary + error_title = f"{error.name} ({error.code})" + if error_title not in error_list: + error_list[error_title] = 0 + error_list[error_title] += 1 + if task.partial: + last_row_checked = error.get("rowPosition", "") + # Validate + error_content = helpers.wrap_text_to_colwidths(error_content) + rows_checked = last_row_checked if task.partial else None + summary_content = helpers.validation_summary( + task.resource.path, + basepath=task.resource.basepath, + time_taken=self.time, + rows_checked=rows_checked, + error_list=error_list, + ) + validation_content += "\n\n" + validation_content += "## Summary " + validation_content += "\n\n" + validation_content += str( + tabulate( + summary_content, + headers=["Description", "Size/Name/Count"], + tablefmt="grid", + ) + ) + if len(error_content) > 0: + validation_content += "\n\n" + validation_content += "## Errors " + validation_content += "\n\n" + validation_content += str( + tabulate( + error_content, + headers=["row", "field", "code", "message"], + tablefmt="grid", + ) + ) + + return validation_content + # Metadata metadata_Error = ReportError @@ -333,82 +401,6 @@ def flatten(self, spec=["rowPosition", "fieldPosition", "code"]): result.append([context.get(prop) for prop in spec]) return result - # Summary - - def to_summary(self) -> dict: - """Summary of the resource - - Raises: - FrictionlessException: on any error - """ - # Process errors - summary = {} - error_list = {} - error_content = [] - for error in self.errors: - if error.code == "scheme-error": - return error - error_content.append( - [ - error.get("rowPosition", ""), - error.get("fieldPosition", ""), - error.code, - error.message, - ] - ) - # error list for summary - error_title = f"{error.name} ({error.code})" - if error_title not in error_list: - error_list[error_title] = 0 - error_list[error_title] += 1 - if self.partial: - last_row_checked = error.get("rowPosition", "") - # Describe - try: - self.resource.infer() - except Exception as exception: - raise FrictionlessException(self.__Error(note=str(exception))) from exception - summary["describe"] = tabulate( - [ - [field.name, field.type, True if field.required else ""] - for field in self.resource.schema.fields - ], - headers=["name", "type", "required"], - tablefmt="grid", - ) - # Extract - # Copy of existing resource to reset the properties to only extract 5 rows - resource = self.resource.to_copy(layout=Layout(limit_rows=5)) - try: - resource.extract() - except Exception as exception: - raise FrictionlessException(self.__Error(note=str(exception))) from exception - summary["extract"] = resource.to_view() - # Validate - summary["validate"] = {} - error_content = helpers.wrap_text_to_colwidths(error_content) - rows_checked = last_row_checked if self.partial else None - summary_content = helpers.validation_summary( - self.resource.path, - basepath=self.resource.basepath, - time_taken=self.time, - rows_checked=rows_checked, - error_list=error_list, - ) - summary["validate"]["summary"] = tabulate( - summary_content, - headers=["Description", "Size/Name/Count"], - tablefmt="grid", - ) - if len(error_content) > 0: - summary["validate"]["errors"] = tabulate( - error_content, - headers=["row", "field", "code", "message"], - tablefmt="grid", - ) - - return summary - # Metadata metadata_Error = ReportError diff --git a/frictionless/schema/schema.py b/frictionless/schema/schema.py index bf8f80ddf1..f21be9b40a 100644 --- a/frictionless/schema/schema.py +++ b/frictionless/schema/schema.py @@ -1,4 +1,5 @@ from copy import copy, deepcopy +from tabulate import tabulate from ..exception import FrictionlessException from ..metadata import Metadata from ..field import Field @@ -289,6 +290,15 @@ def to_excel_template(self, path: str) -> any: ) return tableschema_to_template.create_xlsx(self, path) + # Summary + + def to_summary(self): + content = [ + [field.name, field.type, True if field.required else ""] + for field in self.fields + ] + return tabulate(content, headers=["name", "type", "required"], tablefmt="grid") + # Metadata metadata_duplicate = True diff --git a/tests/program/test_summary.py b/tests/program/test_summary.py index 719e137eb7..e3f2c296c3 100644 --- a/tests/program/test_summary.py +++ b/tests/program/test_summary.py @@ -1,8 +1,9 @@ import os from typer.testing import CliRunner -from frictionless import program +from frictionless import program, helpers runner = CliRunner() +IS_UNIX = not helpers.is_platform("windows") def test_program_error_not_found(): @@ -41,34 +42,24 @@ def test_program_summary_valid(): ) -def test_program_summary_describe_header_row(): - result = runner.invoke(program, "summary data/countries.csv") - assert result.exit_code == 1 - assert result.stdout.count("| name | type | required |") - - def test_program_summary_describe(): result = runner.invoke(program, "summary data/countries.csv") assert result.exit_code == 1 assert ( - result.stdout.count("| id | integer | |") + result.stdout.count("| name | type | required |") + and result.stdout.count("| id | integer | |") and result.stdout.count("| neighbor_id | string | |") and result.stdout.count("| name | string | |") and result.stdout.count("| population | string | |") ) -def test_program_summary_extract_header_row(): - result = runner.invoke(program, "summary data/countries.csv") - assert result.exit_code == 1 - assert result.stdout.count("| id | neighbor_id | name | population |") - - def test_program_summary_extract(): result = runner.invoke(program, "summary data/countries.csv") assert result.exit_code == 1 assert ( - result.stdout.count("| 1 | 'Ireland' | 'Britain' | '67' |") + result.stdout.count("| id | neighbor_id | name | population |") + and result.stdout.count("| 1 | 'Ireland' | 'Britain' | '67' |") and result.stdout.count("| 2 | '3' | 'France' | 'n/a' |") and result.stdout.count("| 3 | '22' | 'Germany' | '83' |") and result.stdout.count("| 4 | None | 'Italy' | '60' |") @@ -127,9 +118,12 @@ def test_program_summary_validate_errors(): def test_program_summary_without_command(tmpdir): output_file_path = f"{tmpdir}/output.txt" exit_code = os.system(f"frictionless data/countries.csv > {output_file_path}") - # A value of 256 means the spawned program terminated with exit code 1 - # https://stackoverflow.com/questions/47832180/os-system-returns-the-value-256-when-run-from-crontab - assert exit_code == 256 + if IS_UNIX: + # A value of 256 means the spawned program terminated with exit code 1 + # https://stackoverflow.com/questions/47832180/os-system-returns-the-value-256-when-run-from-crontab + assert exit_code == 256 + else: + assert exit_code == 1 with open(output_file_path, encoding="utf-8") as file: expected = file.read() assert ( diff --git a/tests/report/test_report.py b/tests/report/test_report.py new file mode 100644 index 0000000000..c85a7d7a47 --- /dev/null +++ b/tests/report/test_report.py @@ -0,0 +1,36 @@ +from frictionless import validate + + +def test_report_summary(): + report = validate("data/countries.csv") + output = report.to_summary() + assert output.count("invalid") and output.count("Summary") and output.count("Errors") + + +def test_report_summary_validate_summary(): + report = validate("data/countries.csv") + output = report.to_summary() + assert ( + output.count("File name |") + and output.count("File size (bytes) | 143") + and output.count("Total Time Taken (sec) |") + and output.count("Total Errors | 4") + and output.count("Extra Cell (extra-cell) | 1") + and output.count("Missing Cell (missing-cell) | 3") + ) + + +def test_report_summary_validate_errors(): + report = validate("data/countries.csv") + output = report.to_summary() + with open("data/fixtures/program/summary/errors.txt", encoding="utf-8") as file: + expected = file.read() + assert output.count(expected.strip()) + + +def test_report_summary_valid(): + report = validate("data/capital-valid.csv") + output = report.to_summary() + assert ( + output.count("valid") and output.count("Summary") and not output.count("Errors") + ) diff --git a/tests/report/test_reporttask.py b/tests/report/test_reporttask.py deleted file mode 100644 index 344377236f..0000000000 --- a/tests/report/test_reporttask.py +++ /dev/null @@ -1,81 +0,0 @@ -from frictionless import validate - - -def test_report_task_error_not_found(): - report = validate("data/countriess.csv") - assert ( - report.tasks[0].errors[0].code == "scheme-error" - and report.tasks[0].errors[0].description - == "Data reading error because of incorrect scheme." - ) - - -def test_report_task_summary(): - report = validate("data/countries.csv") - assert report.valid is False and ["describe", "extract", "validate"] == list( - report.tasks[0].to_summary().keys() - ) - - -def test_report_task_summary_valid(): - report = validate("data/capital-valid.csv") - assert report.valid is True and ["summary"] == list( - report.tasks[0].to_summary()["validate"].keys() - ) - - -def test_report_task_summary_describe(): - report = validate("data/countries.csv") - output = report.tasks[0].to_summary()["describe"] - assert ( - output.count("| id | integer | |") - and output.count("| neighbor_id | string | |") - and output.count("| name | string | |") - and output.count("| population | string | |") - ) - - -def test_report_task_summary_extract(): - report = validate("data/countries.csv") - output = report.tasks[0].to_summary()["extract"] - assert ( - output.count("| 1 | 'Ireland' | 'Britain' | '67' |") - and output.count("| 2 | '3' | 'France' | 'n/a' |") - and output.count("| 3 | '22' | 'Germany' | '83' |") - and output.count("| 4 | None | 'Italy' | '60' |") - and output.count("| 5 | None | None | None |") - ) - - -def test_report_task_summary_extract_only_5_rows(): - report = validate("data/long.csv") - output = report.tasks[0].to_summary()["extract"] - assert ( - output.count("| 1 | 'a' |") - and output.count("| 2 | 'b' |") - and output.count("| 3 | 'c' |") - and output.count("| 4 | 'd' |") - and output.count("| 5 | 'e' |") - and not output.count("| 6 | 'f' |") - ) - - -def test_report_task_summary_validate_summary(): - report = validate("data/countries.csv") - validate_summary = report.tasks[0].to_summary()["validate"]["summary"] - assert ( - validate_summary.count("File name |") - and validate_summary.count("File size (bytes) | 143") - and validate_summary.count("Total Time Taken (sec) |") - and validate_summary.count("Total Errors | 4") - and validate_summary.count("Extra Cell (extra-cell) | 1") - and validate_summary.count("Missing Cell (missing-cell) | 3") - ) - - -def test_report_task_summary_validate_errors(): - report = validate("data/countries.csv") - output_file_path = "data/fixtures/program/summary/errors.txt" - with open(output_file_path, encoding="utf-8") as file: - expected = file.read() - assert report.tasks[0].to_summary()["validate"]["errors"] == expected.strip() diff --git a/tests/resource/describe/test_general.py b/tests/resource/describe/test_general.py index e95e9e17c1..c87bc6b2cd 100644 --- a/tests/resource/describe/test_general.py +++ b/tests/resource/describe/test_general.py @@ -186,3 +186,16 @@ def test_describe_resource_with_json_format_issue_827(): def test_describe_resource_with_years_in_the_header_issue_825(): resource = Resource.describe("data/issue-825.csv") assert resource.schema.field_names == ["Musei", "2011", "2010"] + + +def test_describe_resource_schema_summary(): + resource = Resource.describe("data/countries.csv") + resource.infer() + output = resource.schema.to_summary() + assert ( + output.count("| name | type | required |") + and output.count("| id | integer | |") + and output.count("| neighbor_id | string | |") + and output.count("| name | string | |") + and output.count("| population | string | |") + ) diff --git a/tests/resource/validate/test_general.py b/tests/resource/validate/test_general.py index c683eae3db..155a608929 100644 --- a/tests/resource/validate/test_general.py +++ b/tests/resource/validate/test_general.py @@ -559,3 +559,42 @@ def test_validate_resource_errors_with_fields_993(): 'The data resource has an error: "fields" should be set as "resource.schema.fields" (not "resource.fields").', ] ] + + +def test_validate_resource_summary_invalid(): + resource = Resource("data/countries.csv") + report = resource.validate() + output = report.to_summary() + assert output.count("valid") and output.count("Summary") and output.count("Errors") + + +def test_validate_resource_validate_summary(): + resource = Resource("data/countries.csv") + report = resource.validate() + output = report.to_summary() + assert ( + output.count("File name |") + and output.count("File size (bytes) | 143") + and output.count("Total Time Taken (sec) |") + and output.count("Total Errors | 4") + and output.count("Extra Cell (extra-cell) | 1") + and output.count("Missing Cell (missing-cell) | 3") + ) + + +def test_validate_resource_validate_errors(): + resource = Resource("data/countries.csv") + report = resource.validate() + output = report.to_summary() + with open("data/fixtures/program/summary/errors.txt", encoding="utf-8") as file: + expected = file.read() + assert output.count(expected.strip()) + + +def test_validate_resource_summary_valid(): + resource = Resource("data/capital-valid.csv") + report = resource.validate() + output = report.to_summary() + assert ( + output.count("valid") and output.count("Summary") and not output.count("Errors") + ) From b4fc47247cdc08dd963b1693cb372738107cc895 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Tue, 7 Jun 2022 18:31:00 +0545 Subject: [PATCH 04/10] added test for file not found error --- data/fixtures/report/scheme-error.txt | 9 +++++++++ frictionless/report/report.py | 2 -- tests/report/test_report.py | 9 +++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 data/fixtures/report/scheme-error.txt diff --git a/data/fixtures/report/scheme-error.txt b/data/fixtures/report/scheme-error.txt new file mode 100644 index 0000000000..1fe9bf7613 --- /dev/null +++ b/data/fixtures/report/scheme-error.txt @@ -0,0 +1,9 @@ +## Errors + ++-------+---------+---------+---------------------------------------------------+ +| row | field | code | message | ++=======+=========+=========+===================================================+ +| | | scheme- | The data source could not be successfully loaded: | +| | | error | [Errno 2] No such file or directory: | +| | | | 'data/countriess.csv' | ++-------+---------+---------+---------------------------------------------------+ \ No newline at end of file diff --git a/frictionless/report/report.py b/frictionless/report/report.py index 2131011dc3..b5bde0fe34 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -185,8 +185,6 @@ def to_summary(self): error_list = {} error_content = [] for error in task.errors: - if error.code == "scheme-error": - return error error_content.append( [ error.get("rowPosition", ""), diff --git a/tests/report/test_report.py b/tests/report/test_report.py index c85a7d7a47..850af8f378 100644 --- a/tests/report/test_report.py +++ b/tests/report/test_report.py @@ -1,6 +1,15 @@ from frictionless import validate +def test_program_error_not_found(): + report = validate("data/countriess.csv") + output = report.to_summary() + with open("data/fixtures/report/scheme-error.txt", encoding="utf-8") as file: + expected = file.read() + assert output.count(expected.strip()) + assert output.count("File name (Not Found)") + + def test_report_summary(): report = validate("data/countries.csv") output = report.to_summary() From e936c42a7b3a891569a762db8d5c1fa897bf221c Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Wed, 8 Jun 2022 12:36:45 +0545 Subject: [PATCH 05/10] refactor validate command * replaced the report table display code with new to_summary function * fixed failing tests --- data/fixtures/cli/long-error-messages-976.txt | 59 +++++---- data/fixtures/cli/zipped-resources-979.txt | 70 +++++++---- frictionless/helpers.py | 52 -------- frictionless/program/validate.py | 80 +----------- frictionless/report/report.py | 118 ++++++++++++++---- tests/program/test_validate.py | 1 + 6 files changed, 174 insertions(+), 206 deletions(-) diff --git a/data/fixtures/cli/long-error-messages-976.txt b/data/fixtures/cli/long-error-messages-976.txt index 2ba67bba4b..a0d4ed6774 100644 --- a/data/fixtures/cli/long-error-messages-976.txt +++ b/data/fixtures/cli/long-error-messages-976.txt @@ -2,32 +2,39 @@ # invalid: test-tabulator # ------- +## Summary -# Summary ++---------------------------------+-------------------+ +| Description | Size/Name/Count | ++=================================+===================+ +| File name (Not Found) | test-tabulator | ++---------------------------------+-------------------+ +| File size | N/A | ++---------------------------------+-------------------+ +| Total Time Taken (sec) | | ++---------------------------------+-------------------+ +| Total Errors | 1 | ++---------------------------------+-------------------+ +| Resource Error (resource-error) | 1 | ++---------------------------------+-------------------+ -Description Size/Name/Count -------------------------------- ----------------- -File name (Not Found) test-tabulator -File size N/A -Total Time Taken (sec) -Total Errors 1 -Resource Error (resource-error) 1 +## Errors -# Errors - -row field code message ------ ------- --------- ------------------------------------------------- - resource- The data resource has an error: "{'format': - error 'inline', 'hashing': 'md5', 'name': 'test- - tabulator', 'profile': 'tabular-data-resource', - 'resources': [{'name': 'first-resource', 'path': - 'table.xls', 'schema': {'fields': [{'name': 'id', - 'type': 'number'}, {'name': 'name', 'type': - 'string'}]}}, {'name': 'number-two', 'path': - 'table-reverse.csv', 'schema': {'fields': - [{'name': 'id', 'type': 'integer'}, {'name': - 'name', 'type': 'string'}]}}], 'scheme': '', - 'stats': {'bytes': 0, 'fields': 0, 'hash': '', - 'rows': 0}} is not valid under any of the given - schemas" at "" in metadata and at "oneOf" in - profile \ No newline at end of file ++-------+---------+-----------+---------------------------------------------------+ +| row | field | code | message | ++=======+=========+===========+===================================================+ +| | | resource- | The data resource has an error: "{'format': | +| | | error | 'inline', 'hashing': 'md5', 'name': 'test- | +| | | | tabulator', 'profile': 'tabular-data-resource', | +| | | | 'resources': [{'name': 'first-resource', 'path': | +| | | | 'table.xls', 'schema': {'fields': [{'name': 'id', | +| | | | 'type': 'number'}, {'name': 'name', 'type': | +| | | | 'string'}]}}, {'name': 'number-two', 'path': | +| | | | 'table-reverse.csv', 'schema': {'fields': | +| | | | [{'name': 'id', 'type': 'integer'}, {'name': | +| | | | 'name', 'type': 'string'}]}}], 'scheme': '', | +| | | | 'stats': {'bytes': 0, 'fields': 0, 'hash': '', | +| | | | 'rows': 0}} is not valid under any of the given | +| | | | schemas" at "" in metadata and at "oneOf" in | +| | | | profile | ++-------+---------+-----------+---------------------------------------------------+ \ No newline at end of file diff --git a/data/fixtures/cli/zipped-resources-979.txt b/data/fixtures/cli/zipped-resources-979.txt index de9b7ef3b2..1783f5b1bc 100644 --- a/data/fixtures/cli/zipped-resources-979.txt +++ b/data/fixtures/cli/zipped-resources-979.txt @@ -2,43 +2,61 @@ # valid: ogd10_energieforschungstatistik_ch.csv # ----- -# Summary +## Summary + ++------------------------+----------------------------------------+ +| Description | Size/Name/Count | ++========================+========================================+ +| File name | ogd10_energieforschungstatistik_ch.csv | ++------------------------+----------------------------------------+ +| File size (KB) | 88541 | ++------------------------+----------------------------------------+ +| Total Time Taken (sec) | | ++------------------------+----------------------------------------+ + -Description Size/Name/Count ----------------------- -------------------------------------- -File name ogd10_energieforschungstatistik_ch.csv -File size (KB) 88541 -Total Time Taken (sec) # ------- # invalid: ogd10_catalogs.zip => capital-invalid.csv # ------- +## Summary -# Summary ++-----------------------------+-------------------------------------------+ +| Description | Size/Name/Count | ++=============================+===========================================+ +| File name (Not Found) | ogd10_catalogs.zip => capital-invalid.csv | ++-----------------------------+-------------------------------------------+ +| File size | N/A | ++-----------------------------+-------------------------------------------+ +| Total Time Taken (sec) | | ++-----------------------------+-------------------------------------------+ +| Total Errors | 1 | ++-----------------------------+-------------------------------------------+ +| Schema Error (schema-error) | 1 | ++-----------------------------+-------------------------------------------+ -Description Size/Name/Count ---------------------------- ----------------------------------------- -File name (Not Found) ogd10_catalogs.zip => capital-invalid.csv -File size N/A -Total Time Taken (sec) -Total Errors 1 -Schema Error (schema-error) 1 +## Errors -# Errors ++-------+---------+---------+---------------------------------------------------+ +| row | field | code | message | ++=======+=========+=========+===================================================+ +| | | schema- | Schema is not valid: Schemas with duplicate field | +| | | error | names are not supported | ++-------+---------+---------+---------------------------------------------------+ -row field code message ------ ------- ------- ------------------------------------------------- - schema- Schema is not valid: Schemas with duplicate field - error names are not supported # ----- # valid: ogd10_catalogs.zip => finanzquellen.csv # ----- -# Summary - -Description Size/Name/Count ----------------------- --------------------------------------- -File name (Not Found) ogd10_catalogs.zip => finanzquellen.csv -File size N/A -Total Time Taken (sec) +## Summary + ++------------------------+-----------------------------------------+ +| Description | Size/Name/Count | ++========================+=========================================+ +| File name (Not Found) | ogd10_catalogs.zip => finanzquellen.csv | ++------------------------+-----------------------------------------+ +| File size | N/A | ++------------------------+-----------------------------------------+ +| Total Time Taken (sec) | | ++------------------------+-----------------------------------------+ \ No newline at end of file diff --git a/frictionless/helpers.py b/frictionless/helpers.py index d80a4ca145..8686b11e73 100644 --- a/frictionless/helpers.py +++ b/frictionless/helpers.py @@ -634,29 +634,6 @@ def dicts_to_markdown_table(dicts: List[dict], **kwargs) -> str: return df.where(df.notnull(), None).to_markdown(index=False) -# TODO:This is a temporary function to use with tabulate as -# tabulate 0.8.9 does not support text wrap -def wrap_text_to_colwidths(list_of_lists: List, colwidths: List = [5, 5, 10, 50]) -> List: - """Create new list with wrapped text with different column width. - Args: - list_of_lists (List): List of lines - colwidths (List): width for each column - - Returns: - List: list of lines with wrapped text - - """ - result = [] - for row in list_of_lists: - new_row = [] - for cell, width in zip(row, colwidths): - cell = str(cell) - wrapped = textwrap.wrap(cell, width=width) - new_row.append("\n".join(wrapped)) - result.append(new_row) - return result - - def format_bytes(size: int) -> str: """Format bytes to larger units""" units = ["bytes", "KB", "MB", "GB", "TB"] @@ -664,32 +641,3 @@ def format_bytes(size: int) -> str: if index > len(units): index = len(units) - 1 return units[index] - - -def validation_summary( - source: str, - time_taken: str, - basepath: str = None, - rows_checked: int = None, - error_list: List = None, -) -> List: - """Generate summary for validation task""" - file_path = os.path.join(basepath, source) if basepath else source - file_size = "N/A" - unit = None - if os.path.exists(file_path): - file_size = os.path.getsize(file_path) - unit = format_bytes(file_size) - content = [ - [f"File name { '' if unit else '(Not Found)' }", source], - [f"File size { f'({unit})' if unit else '' }", file_size], - ["Total Time Taken (sec)", time_taken], - ] - if rows_checked: - content.append(["Rows Checked(Partial)**", rows_checked]) - if error_list: - content.append(["Total Errors", sum(error_list.values())]) - for code, count in error_list.items(): - content.append([code, count]) - - return content diff --git a/frictionless/program/validate.py b/frictionless/program/validate.py index e56b9dff8d..1265e2fd6f 100644 --- a/frictionless/program/validate.py +++ b/frictionless/program/validate.py @@ -215,7 +215,7 @@ def program_validate( typer.secho(content) raise typer.Exit() - # Return report + # Return validation report errors if report.errors: content = [] if is_stdin: @@ -230,82 +230,8 @@ def program_validate( str(tabulate(content, headers=["code", "message"], tablefmt="simple")) ) - # Return tables - prev_invalid = False - for number, task in enumerate(report.tasks, start=1): - tabular = task.resource.profile == "tabular-data-resource" - if number != 1 and prev_invalid: - typer.secho("") - prefix = "valid" if task.valid else "invalid" - suffix = "" if tabular else "(non-tabular)" - source = task.resource.path or task.resource.name - # for zipped resources append file name - if task.resource.innerpath: - source = f"{source} => {task.resource.innerpath}" - if is_stdin: - source = "stdin" - typer.secho(f"# {'-'*len(prefix)}", bold=True) - typer.secho(f"# {prefix}: {source} {suffix}", bold=True) - typer.secho(f"# {'-'*len(prefix)}", bold=True) - error_list = {} - if task.errors: - prev_invalid = True - typer.secho("") - content = [] - for error in task.errors: - content.append( - [ - error.get("rowPosition", ""), - error.get("fieldPosition", ""), - error.code, - error.message, - ] - ) - # error list for summary - error_title = f"{error.name} ({error.code})" - if error_title not in error_list: - error_list[error_title] = 0 - error_list[error_title] += 1 - if task.partial: - last_row_checked = error.get("rowPosition", "") - content = helpers.wrap_text_to_colwidths(content) - # summary - rows_checked = last_row_checked if task.partial else None - summary_content = helpers.validation_summary( - source, - basepath=task.resource.basepath, - time_taken=task.time, - rows_checked=rows_checked, - error_list=error_list, - ) - typer.echo("\n# Summary \n") - if task.partial: - typer.echo( - "The document was partially validated because of one of the limits" - ) - typer.echo("* limit errors") - typer.echo("* memory Limit \n") - typer.secho( - str( - tabulate( - summary_content, - headers=["Description", "Size/Name/Count"], - tablefmt="simple", - ) - ) - ) - # errors - if task.errors: - typer.echo("\n# Errors \n") - typer.secho( - str( - tabulate( - content, - headers=["row", "field", "code", "message"], - tablefmt="simple", - ) - ) - ) + # Return validation report summary and tables + typer.secho(str(report.to_summary())) # Return retcode raise typer.Exit(code=int(not report.valid)) diff --git a/frictionless/report/report.py b/frictionless/report/report.py index b5bde0fe34..5b148be438 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -1,7 +1,10 @@ +import os import functools +import textwrap from copy import deepcopy from importlib import import_module from tabulate import tabulate +from typing import List from ..metadata import Metadata from ..errors import Error, TaskError, ReportError from ..exception import FrictionlessException @@ -170,7 +173,7 @@ def wrapper(*args, **kwargs): # Summary def to_summary(self): - validation_content = None + validation_content = "" for task in self.tasks: tabular = task.resource.profile == "tabular-data-resource" prefix = "valid" if task.valid else "invalid" @@ -179,32 +182,33 @@ def to_summary(self): # for zipped resources append file name if task.resource.innerpath: source = f"{source} => {task.resource.innerpath}" - validation_content = f"\n# {'-'*len(prefix)}\n" - validation_content += f"\n# {prefix}: {source} {suffix}\n" - validation_content += f"\n# {'-'*len(prefix)}\n" + validation_content += f"\n# {'-'*len(prefix)}" + validation_content += f"\n# {prefix}: {source} {suffix}" + validation_content += f"\n# {'-'*len(prefix)}" error_list = {} error_content = [] - for error in task.errors: - error_content.append( - [ - error.get("rowPosition", ""), - error.get("fieldPosition", ""), - error.code, - error.message, - ] - ) - # error list for summary - error_title = f"{error.name} ({error.code})" - if error_title not in error_list: - error_list[error_title] = 0 - error_list[error_title] += 1 - if task.partial: - last_row_checked = error.get("rowPosition", "") + if task.errors: + for error in task.errors: + error_content.append( + [ + error.get("rowPosition", ""), + error.get("fieldPosition", ""), + error.code, + error.message, + ] + ) + # error list for summary + error_title = f"{error.name} ({error.code})" + if error_title not in error_list: + error_list[error_title] = 0 + error_list[error_title] += 1 + if task.partial: + last_row_checked = error.get("rowPosition", "") # Validate - error_content = helpers.wrap_text_to_colwidths(error_content) + error_content = _wrap_text_to_colwidths(error_content) rows_checked = last_row_checked if task.partial else None - summary_content = helpers.validation_summary( - task.resource.path, + summary_content = _validation_summary( + source, basepath=task.resource.basepath, time_taken=self.time, rows_checked=rows_checked, @@ -213,6 +217,14 @@ def to_summary(self): validation_content += "\n\n" validation_content += "## Summary " validation_content += "\n\n" + if task.partial: + validation_content += "\n\n" + validation_content += ( + "The document was partially validated because of one of the limits\n" + ) + validation_content += "* limit errors" + validation_content += "* memory Limit" + validation_content += "\n\n" validation_content += str( tabulate( summary_content, @@ -220,8 +232,9 @@ def to_summary(self): tablefmt="grid", ) ) - if len(error_content) > 0: - validation_content += "\n\n" + validation_content += "\n\n" + # errors + if task.errors: validation_content += "## Errors " validation_content += "\n\n" validation_content += str( @@ -231,6 +244,7 @@ def to_summary(self): tablefmt="grid", ) ) + validation_content += "\n\n" return validation_content @@ -412,3 +426,57 @@ def metadata_process(self): if not isinstance(resource, Resource): resource = Resource(resource) dict.__setitem__(self, "resource", resource) + + +# TODO:This is a temporary function to use with tabulate as +# tabulate 0.8.9 does not support text wrap +def _wrap_text_to_colwidths( + list_of_lists: List, colwidths: List = [5, 5, 10, 50] +) -> List: + """Create new list with wrapped text with different column width. + Args: + list_of_lists (List): List of lines + colwidths (List): width for each column + + Returns: + List: list of lines with wrapped text + + """ + result = [] + for row in list_of_lists: + new_row = [] + for cell, width in zip(row, colwidths): + cell = str(cell) + wrapped = textwrap.wrap(cell, width=width) + new_row.append("\n".join(wrapped)) + result.append(new_row) + return result + + +def _validation_summary( + source: str, + time_taken: str, + basepath: str = None, + rows_checked: int = None, + error_list: List = None, +) -> List: + """Generate summary for validation task""" + file_path = os.path.join(basepath, source) if basepath else source + file_size = "N/A" + unit = None + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + unit = helpers.format_bytes(file_size) + content = [ + [f"File name { '' if unit else '(Not Found)' }", source], + [f"File size { f'({unit})' if unit else '' }", file_size], + ["Total Time Taken (sec)", time_taken], + ] + if rows_checked: + content.append(["Rows Checked(Partial)**", rows_checked]) + if error_list: + content.append(["Total Errors", sum(error_list.values())]) + for code, count in error_list.items(): + content.append([code, count]) + + return content diff --git a/tests/program/test_validate.py b/tests/program/test_validate.py index 27eda6afee..b7bcd21e8e 100644 --- a/tests/program/test_validate.py +++ b/tests/program/test_validate.py @@ -226,6 +226,7 @@ def test_program_validate_zipped_resources_979(): assert result.stdout.count("Schema is not valid") # remove timetaken floating point number which varies output = re.sub(r"(\d+)\.(.*)\d", "", result.stdout) + print(output) assert output.strip() == expected.strip() From 4a98056d5599c21078fd371e8856731e1eb4164d Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Wed, 8 Jun 2022 12:44:10 +0545 Subject: [PATCH 06/10] removed print statement --- tests/program/test_validate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/program/test_validate.py b/tests/program/test_validate.py index b7bcd21e8e..27eda6afee 100644 --- a/tests/program/test_validate.py +++ b/tests/program/test_validate.py @@ -226,7 +226,6 @@ def test_program_validate_zipped_resources_979(): assert result.stdout.count("Schema is not valid") # remove timetaken floating point number which varies output = re.sub(r"(\d+)\.(.*)\d", "", result.stdout) - print(output) assert output.strip() == expected.strip() From 6a881eceebe27e9de8912c1a7d8e2151ee67c039 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Wed, 8 Jun 2022 13:02:43 +0545 Subject: [PATCH 07/10] made validation_summary part of the report class --- frictionless/report/report.py | 66 +++++++++++++++++++---------------- frictionless/schema/schema.py | 6 ++++ 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/frictionless/report/report.py b/frictionless/report/report.py index 5b148be438..5279ce9f2e 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -173,6 +173,12 @@ def wrapper(*args, **kwargs): # Summary def to_summary(self): + """Summary of the report + + Returns: + str: validation report + """ + validation_content = "" for task in self.tasks: tabular = task.resource.profile == "tabular-data-resource" @@ -207,7 +213,7 @@ def to_summary(self): # Validate error_content = _wrap_text_to_colwidths(error_content) rows_checked = last_row_checked if task.partial else None - summary_content = _validation_summary( + summary_content = self.validation_summary( source, basepath=task.resource.basepath, time_taken=self.time, @@ -248,6 +254,35 @@ def to_summary(self): return validation_content + def validation_summary( + self, + source: str, + time_taken: str, + basepath: str = None, + rows_checked: int = None, + error_list: List = None, + ) -> List: + """Generate summary for validation task""" + file_path = os.path.join(basepath, source) if basepath else source + file_size = "N/A" + unit = None + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + unit = helpers.format_bytes(file_size) + content = [ + [f"File name { '' if unit else '(Not Found)' }", source], + [f"File size { f'({unit})' if unit else '' }", file_size], + ["Total Time Taken (sec)", time_taken], + ] + if rows_checked: + content.append(["Rows Checked(Partial)**", rows_checked]) + if error_list: + content.append(["Total Errors", sum(error_list.values())]) + for code, count in error_list.items(): + content.append([code, count]) + + return content + # Metadata metadata_Error = ReportError @@ -451,32 +486,3 @@ def _wrap_text_to_colwidths( new_row.append("\n".join(wrapped)) result.append(new_row) return result - - -def _validation_summary( - source: str, - time_taken: str, - basepath: str = None, - rows_checked: int = None, - error_list: List = None, -) -> List: - """Generate summary for validation task""" - file_path = os.path.join(basepath, source) if basepath else source - file_size = "N/A" - unit = None - if os.path.exists(file_path): - file_size = os.path.getsize(file_path) - unit = helpers.format_bytes(file_size) - content = [ - [f"File name { '' if unit else '(Not Found)' }", source], - [f"File size { f'({unit})' if unit else '' }", file_size], - ["Total Time Taken (sec)", time_taken], - ] - if rows_checked: - content.append(["Rows Checked(Partial)**", rows_checked]) - if error_list: - content.append(["Total Errors", sum(error_list.values())]) - for code, count in error_list.items(): - content.append([code, count]) - - return content diff --git a/frictionless/schema/schema.py b/frictionless/schema/schema.py index f21be9b40a..8113ee1854 100644 --- a/frictionless/schema/schema.py +++ b/frictionless/schema/schema.py @@ -293,6 +293,12 @@ def to_excel_template(self, path: str) -> any: # Summary def to_summary(self): + """Summary of the schema in table format + + Returns: + str: schema summary + """ + content = [ [field.name, field.type, True if field.required else ""] for field in self.fields From b6dd72acca4f9c9382d93dc2e07d910f23bd0dfe Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Wed, 8 Jun 2022 13:29:56 +0545 Subject: [PATCH 08/10] fixes to failing test * simplified test output for multiline text --- data/fixtures/cli/zipped-resources-979.txt | 56 +--------------------- tests/program/test_validate.py | 13 +++-- 2 files changed, 7 insertions(+), 62 deletions(-) diff --git a/data/fixtures/cli/zipped-resources-979.txt b/data/fixtures/cli/zipped-resources-979.txt index 1783f5b1bc..dff7178988 100644 --- a/data/fixtures/cli/zipped-resources-979.txt +++ b/data/fixtures/cli/zipped-resources-979.txt @@ -1,40 +1,3 @@ -# ----- -# valid: ogd10_energieforschungstatistik_ch.csv -# ----- - -## Summary - -+------------------------+----------------------------------------+ -| Description | Size/Name/Count | -+========================+========================================+ -| File name | ogd10_energieforschungstatistik_ch.csv | -+------------------------+----------------------------------------+ -| File size (KB) | 88541 | -+------------------------+----------------------------------------+ -| Total Time Taken (sec) | | -+------------------------+----------------------------------------+ - - -# ------- -# invalid: ogd10_catalogs.zip => capital-invalid.csv -# ------- - -## Summary - -+-----------------------------+-------------------------------------------+ -| Description | Size/Name/Count | -+=============================+===========================================+ -| File name (Not Found) | ogd10_catalogs.zip => capital-invalid.csv | -+-----------------------------+-------------------------------------------+ -| File size | N/A | -+-----------------------------+-------------------------------------------+ -| Total Time Taken (sec) | | -+-----------------------------+-------------------------------------------+ -| Total Errors | 1 | -+-----------------------------+-------------------------------------------+ -| Schema Error (schema-error) | 1 | -+-----------------------------+-------------------------------------------+ - ## Errors +-------+---------+---------+---------------------------------------------------+ @@ -42,21 +5,4 @@ +=======+=========+=========+===================================================+ | | | schema- | Schema is not valid: Schemas with duplicate field | | | | error | names are not supported | -+-------+---------+---------+---------------------------------------------------+ - - -# ----- -# valid: ogd10_catalogs.zip => finanzquellen.csv -# ----- - -## Summary - -+------------------------+-----------------------------------------+ -| Description | Size/Name/Count | -+========================+=========================================+ -| File name (Not Found) | ogd10_catalogs.zip => finanzquellen.csv | -+------------------------+-----------------------------------------+ -| File size | N/A | -+------------------------+-----------------------------------------+ -| Total Time Taken (sec) | | -+------------------------+-----------------------------------------+ \ No newline at end of file ++-------+---------+---------+---------------------------------------------------+ \ No newline at end of file diff --git a/tests/program/test_validate.py b/tests/program/test_validate.py index 27eda6afee..f4355c6411 100644 --- a/tests/program/test_validate.py +++ b/tests/program/test_validate.py @@ -220,13 +220,12 @@ def test_program_validate_zipped_resources_979(): with open(output_file_path, encoding="utf-8") as file: expected = file.read() assert result.exit_code == 1 - assert result.stdout.count("valid: ogd10_energieforschungstatistik_ch.csv") - assert result.stdout.count("valid: ogd10_catalogs.zip => finanzquellen.csv") - assert result.stdout.count("invalid: ogd10_catalogs.zip => capital-invalid.csv") - assert result.stdout.count("Schema is not valid") - # remove timetaken floating point number which varies - output = re.sub(r"(\d+)\.(.*)\d", "", result.stdout) - assert output.strip() == expected.strip() + assert ( + result.stdout.count("valid: ogd10_energieforschungstatistik_ch.csv") + and result.stdout.count("valid: ogd10_catalogs.zip => finanzquellen.csv") + and result.stdout.count("invalid: ogd10_catalogs.zip => capital-invalid.csv") + and result.stdout.count(expected.strip()) + ) def test_program_validate_long_error_messages_976(): From 16a05cf26f70bf7a263463a8aacf311591fa0771 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Thu, 9 Jun 2022 17:19:25 +0545 Subject: [PATCH 09/10] * Refactored display summary code in report.py class/file. Added validation_summary as to_summary to report task class * Changed the code of report.to_summary to use reporttask summary function * Removed unrelated tests code added in previous commit and added new ones for schema, resource, reporttask and report * Organized tests files for summary feature --- data/fixtures/cli/long-error-messages-976.txt | 20 ---- .../multiline-errors.txt} | 0 .../multiline-scheme-error.txt} | 0 frictionless/program/summary.py | 1 - frictionless/report/report.py | 113 +++++++++--------- tests/program/test_summary.py | 13 +- tests/program/test_validate.py | 4 +- tests/report/test_report.py | 60 +++++++--- tests/report/test_reporttask.py | 73 +++++++++++ tests/resource/test_general.py | 26 ++++ tests/resource/validate/test_general.py | 39 ------ tests/schema/test_general.py | 66 ++++++++++ 12 files changed, 274 insertions(+), 141 deletions(-) rename data/fixtures/{program/summary/errors.txt => summary/multiline-errors.txt} (100%) rename data/fixtures/{report/scheme-error.txt => summary/multiline-scheme-error.txt} (100%) create mode 100644 tests/report/test_reporttask.py diff --git a/data/fixtures/cli/long-error-messages-976.txt b/data/fixtures/cli/long-error-messages-976.txt index a0d4ed6774..7cddfd1171 100644 --- a/data/fixtures/cli/long-error-messages-976.txt +++ b/data/fixtures/cli/long-error-messages-976.txt @@ -1,23 +1,3 @@ -# ------- -# invalid: test-tabulator -# ------- - -## Summary - -+---------------------------------+-------------------+ -| Description | Size/Name/Count | -+=================================+===================+ -| File name (Not Found) | test-tabulator | -+---------------------------------+-------------------+ -| File size | N/A | -+---------------------------------+-------------------+ -| Total Time Taken (sec) | | -+---------------------------------+-------------------+ -| Total Errors | 1 | -+---------------------------------+-------------------+ -| Resource Error (resource-error) | 1 | -+---------------------------------+-------------------+ - ## Errors +-------+---------+-----------+---------------------------------------------------+ diff --git a/data/fixtures/program/summary/errors.txt b/data/fixtures/summary/multiline-errors.txt similarity index 100% rename from data/fixtures/program/summary/errors.txt rename to data/fixtures/summary/multiline-errors.txt diff --git a/data/fixtures/report/scheme-error.txt b/data/fixtures/summary/multiline-scheme-error.txt similarity index 100% rename from data/fixtures/report/scheme-error.txt rename to data/fixtures/summary/multiline-scheme-error.txt diff --git a/frictionless/program/summary.py b/frictionless/program/summary.py index 5f636edd13..47f3cd41b3 100644 --- a/frictionless/program/summary.py +++ b/frictionless/program/summary.py @@ -38,7 +38,6 @@ def program_summary(source: str = common.source): raise typer.Exit(1) typer.secho("") typer.secho("# Validate ", bold=True) - typer.secho("") typer.secho(str(report.to_summary())) # Return retcode diff --git a/frictionless/report/report.py b/frictionless/report/report.py index 5279ce9f2e..e1d8017067 100644 --- a/frictionless/report/report.py +++ b/frictionless/report/report.py @@ -191,7 +191,6 @@ def to_summary(self): validation_content += f"\n# {'-'*len(prefix)}" validation_content += f"\n# {prefix}: {source} {suffix}" validation_content += f"\n# {'-'*len(prefix)}" - error_list = {} error_content = [] if task.errors: for error in task.errors: @@ -203,41 +202,19 @@ def to_summary(self): error.message, ] ) - # error list for summary - error_title = f"{error.name} ({error.code})" - if error_title not in error_list: - error_list[error_title] = 0 - error_list[error_title] += 1 - if task.partial: - last_row_checked = error.get("rowPosition", "") # Validate error_content = _wrap_text_to_colwidths(error_content) - rows_checked = last_row_checked if task.partial else None - summary_content = self.validation_summary( - source, - basepath=task.resource.basepath, - time_taken=self.time, - rows_checked=rows_checked, - error_list=error_list, - ) validation_content += "\n\n" validation_content += "## Summary " validation_content += "\n\n" if task.partial: - validation_content += "\n\n" validation_content += ( "The document was partially validated because of one of the limits\n" ) - validation_content += "* limit errors" + validation_content += "* limit errors \n" validation_content += "* memory Limit" validation_content += "\n\n" - validation_content += str( - tabulate( - summary_content, - headers=["Description", "Size/Name/Count"], - tablefmt="grid", - ) - ) + validation_content += task.to_summary() validation_content += "\n\n" # errors if task.errors: @@ -254,35 +231,6 @@ def to_summary(self): return validation_content - def validation_summary( - self, - source: str, - time_taken: str, - basepath: str = None, - rows_checked: int = None, - error_list: List = None, - ) -> List: - """Generate summary for validation task""" - file_path = os.path.join(basepath, source) if basepath else source - file_size = "N/A" - unit = None - if os.path.exists(file_path): - file_size = os.path.getsize(file_path) - unit = helpers.format_bytes(file_size) - content = [ - [f"File name { '' if unit else '(Not Found)' }", source], - [f"File size { f'({unit})' if unit else '' }", file_size], - ["Total Time Taken (sec)", time_taken], - ] - if rows_checked: - content.append(["Rows Checked(Partial)**", rows_checked]) - if error_list: - content.append(["Total Errors", sum(error_list.values())]) - for code, count in error_list.items(): - content.append([code, count]) - - return content - # Metadata metadata_Error = ReportError @@ -448,6 +396,63 @@ def flatten(self, spec=["rowPosition", "fieldPosition", "code"]): result.append([context.get(prop) for prop in spec]) return result + # Summary + + def to_summary( + self, + ) -> str: + """Generate summary for validation task" + + Returns: + str: validation summary + """ + source = self.resource.path or self.resource.name + # For zipped resources append file name + if self.resource.innerpath: + source = f"{source} => {self.resource.innerpath}" + file_path = ( + os.path.join(self.resource.basepath, source) + if self.resource.basepath + else source + ) + # Prepare error lists and last row checked(in case of partial validation) + error_list = {} + for error in self.errors: + error_title = f"{error.name} ({error.code})" + if error_title not in error_list: + error_list[error_title] = 0 + error_list[error_title] += 1 + if self.partial: + last_row_checked = error.get("rowPosition", "") + rows_checked = last_row_checked if self.partial else None + file_size = "N/A" + unit = None + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + unit = helpers.format_bytes(file_size) + not_found_text = "" + if not unit: + if not self.resource.innerpath: + not_found_text = "(Not Found)" + content = [ + [f"File name {not_found_text}", source], + [f"File size { f'({unit})' if unit else '' }", file_size], + ["Total Time Taken (sec)", self.time], + ] + if rows_checked: + content.append(["Rows Checked(Partial)**", rows_checked]) + if error_list: + content.append(["Total Errors", sum(error_list.values())]) + for code, count in error_list.items(): + content.append([code, count]) + return str( + tabulate( + content, + headers=["Description", "Size/Name/Count"], + tablefmt="grid", + ) + ) + # Metadata metadata_Error = ReportError diff --git a/tests/program/test_summary.py b/tests/program/test_summary.py index e3f2c296c3..8ed24a0cb9 100644 --- a/tests/program/test_summary.py +++ b/tests/program/test_summary.py @@ -6,7 +6,7 @@ IS_UNIX = not helpers.is_platform("windows") -def test_program_error_not_found(): +def test_program_summary_error_not_found(): result = runner.invoke(program, "summary data/countriess.csv") assert result.exit_code == 1 assert ( @@ -87,17 +87,12 @@ def test_program_summary_validate(): assert result.stdout.count("# invalid:") -def test_program_summary_validate_summary_header_row(): - result = runner.invoke(program, "summary data/countries.csv") - assert result.exit_code == 1 - assert result.stdout.count("Description | Size/Name/Count") - - def test_program_summary_validate_summary(): result = runner.invoke(program, "summary data/countries.csv") assert result.exit_code == 1 assert ( - result.stdout.count("File name |") + result.stdout.count("Description | Size/Name/Count") + and result.stdout.count("File name | data/countries.csv") and result.stdout.count("File size (bytes) | 143") and result.stdout.count("Total Time Taken (sec) |") and result.stdout.count("Total Errors | 4") @@ -108,7 +103,7 @@ def test_program_summary_validate_summary(): def test_program_summary_validate_errors(): result = runner.invoke(program, "summary data/countries.csv") - output_file_path = "data/fixtures/program/summary/errors.txt" + output_file_path = "data/fixtures/summary/multiline-errors.txt" with open(output_file_path, encoding="utf-8") as file: expected = file.read() assert result.exit_code == 1 diff --git a/tests/program/test_validate.py b/tests/program/test_validate.py index f4355c6411..00480368f9 100644 --- a/tests/program/test_validate.py +++ b/tests/program/test_validate.py @@ -1,6 +1,5 @@ import json import yaml -import re from typer.testing import CliRunner from frictionless import Metadata, Detector, program, validate @@ -233,9 +232,8 @@ def test_program_validate_long_error_messages_976(): output_file_path = "data/fixtures/cli/long-error-messages-976.txt" with open(output_file_path, encoding="utf-8") as file: expected = file.read() - output = re.sub(r"(\d+)\.(.*)\d", "", result.stdout) assert result.exit_code == 1 - assert output.strip() == expected.strip() + assert result.stdout.count(expected.strip()) def test_program_validate_partial_validation_info_933(): diff --git a/tests/report/test_report.py b/tests/report/test_report.py index 850af8f378..8616bfad74 100644 --- a/tests/report/test_report.py +++ b/tests/report/test_report.py @@ -4,42 +4,72 @@ def test_program_error_not_found(): report = validate("data/countriess.csv") output = report.to_summary() - with open("data/fixtures/report/scheme-error.txt", encoding="utf-8") as file: + with open( + "data/fixtures/summary/multiline-scheme-error.txt", encoding="utf-8" + ) as file: expected = file.read() assert output.count(expected.strip()) assert output.count("File name (Not Found)") -def test_report_summary(): +def test_report_summary_valid(): + report = validate("data/capital-valid.csv") + output = report.to_summary() + assert ( + output.count("valid") and output.count("Summary") and not output.count("Errors") + ) + + +def test_report_summary_invalid(): report = validate("data/countries.csv") output = report.to_summary() assert output.count("invalid") and output.count("Summary") and output.count("Errors") -def test_report_summary_validate_summary(): - report = validate("data/countries.csv") +def test_report_summary_validate_summary_valid(): + report = validate("data/capital-valid.csv") output = report.to_summary() assert ( - output.count("File name |") - and output.count("File size (bytes) | 143") - and output.count("Total Time Taken (sec) |") - and output.count("Total Errors | 4") - and output.count("Extra Cell (extra-cell) | 1") - and output.count("Missing Cell (missing-cell) | 3") + output.count("valid") + and output.count("Summary") + and output.count("File name | data/capital-valid.csv") + and output.count("File size (bytes) | 50 ") + and output.count("Total Time Taken (sec) | ") ) -def test_report_summary_validate_errors(): +def test_report_summary_validate_summary_invalid(): + report = validate("data/capital-invalid.csv") + output = report.to_summary() + assert ( + output.count("invalid") + and output.count("Summary") + and output.count("File name | data/capital-invalid.csv") + and output.count("File size (bytes) | 171 ") + and output.count("Total Time Taken (sec) |") + and output.count("Total Errors | 5 ") + and output.count("Duplicate Label (duplicate-label) | 1 ") + and output.count("Missing Cell (missing-cell) | 1 ") + and output.count("Blank Row (blank-row) | 1 ") + and output.count("Type Error (type-error) | 1 ") + and output.count("Extra Cell (extra-cell) | 1 ") + ) + + +def test_report_summary_validate_multiline_errors(): report = validate("data/countries.csv") output = report.to_summary() - with open("data/fixtures/program/summary/errors.txt", encoding="utf-8") as file: + with open("data/fixtures/summary/multiline-errors.txt", encoding="utf-8") as file: expected = file.read() assert output.count(expected.strip()) -def test_report_summary_valid(): - report = validate("data/capital-valid.csv") +def test_report_summary_partial_validation(): + report = validate("data/capital-invalid.csv", limit_errors=2) output = report.to_summary() assert ( - output.count("valid") and output.count("Summary") and not output.count("Errors") + output.count("The document was partially validated because of one of the limits") + and output.count("limit errors") + and output.count("memory Limit") + and output.count("Rows Checked(Partial)** | 10") ) diff --git a/tests/report/test_reporttask.py b/tests/report/test_reporttask.py new file mode 100644 index 0000000000..a3558a3623 --- /dev/null +++ b/tests/report/test_reporttask.py @@ -0,0 +1,73 @@ +from frictionless import validate + + +def test_report_reporttask_summary_valid(): + report = validate("data/capital-valid.csv") + output = report.tasks[0].to_summary() + assert ( + output.count("File name | data/capital-valid.csv") + and output.count("File size (bytes) | 50 ") + and output.count("Total Time Taken (sec) | ") + ) + + +def test_report_reporttask_summary_invalid(): + report = validate("data/capital-invalid.csv") + output = report.tasks[0].to_summary() + assert ( + output.count("File name | data/capital-invalid.csv") + and output.count("File size (bytes) | 171 ") + and output.count("Total Time Taken (sec) |") + and output.count("Total Errors | 5 ") + and output.count("Duplicate Label (duplicate-label) | 1 ") + and output.count("Missing Cell (missing-cell) | 1 ") + and output.count("Blank Row (blank-row) | 1 ") + and output.count("Type Error (type-error) | 1 ") + and output.count("Extra Cell (extra-cell) | 1 ") + ) + + +def test_report_reporttask_summary_filenotfound(): + report = validate("data/capital-invalids.csv") + output = report.tasks[0].to_summary() + assert ( + output.count("File name (Not Found) | data/capital-invalids.csv") + and output.count("File size | N/A") + and output.count("Total Time Taken (sec) ") + and output.count("Total Errors | 1") + and output.count("Scheme Error (scheme-error) | 1") + ) + + +def test_report_reporttask_summary_zippedfile(): + report = validate("data/table.csv.zip") + output = report.tasks[0].to_summary() + assert ( + output.count("File name | data/table.csv.zip => table.csv") + and output.count("File size | N/A") + and output.count("Total Time Taken (sec) |") + ) + + +def test_report_reporttask_summary_lastrowchecked(): + report = validate("data/capital-invalid.csv", limit_errors=2) + output = report.tasks[0].to_summary() + assert ( + output.count("Rows Checked(Partial)** | 10") + and output.count("Total Errors | 2") + and output.count("Duplicate Label (duplicate-label) | 1") + and output.count("Missing Cell (missing-cell) | 1") + ) + + +def test_report_reporttask_summary_errors_with_count(): + report = validate("data/capital-invalid.csv") + output = report.tasks[0].to_summary() + assert ( + output.count("Total Errors | 5 ") + and output.count("Duplicate Label (duplicate-label) | 1 ") + and output.count("Missing Cell (missing-cell) | 1 ") + and output.count("Blank Row (blank-row) | 1 ") + and output.count("Type Error (type-error) | 1 ") + and output.count("Extra Cell (extra-cell) | 1 ") + ) diff --git a/tests/resource/test_general.py b/tests/resource/test_general.py index 365b22fa84..4229cd226c 100644 --- a/tests/resource/test_general.py +++ b/tests/resource/test_general.py @@ -549,3 +549,29 @@ def test_resource_pprint_1029(): 'path': 'data/table.csv', 'title': 'My Resource'}""" assert repr(resource) == expected + + +def test_resource_summary_valid_resource(): + resource = Resource("data/capital-valid.csv") + output = resource.to_view() + assert ( + output.count("| id | name |") + and output.count("| 1 | 'London' |") + and output.count("| 2 | 'Berlin' |") + and output.count("| 3 | 'Paris' |") + and output.count("| 4 | 'Madrid' |") + and output.count("| 5 | 'Rome' |") + ) + + +def test_resource_summary_invalid_resource(): + resource = Resource("data/countries.csv") + output = resource.to_view() + assert ( + output.count("| id | neighbor_id | name | population |") + and output.count("| 1 | 'Ireland' | 'Britain' | '67' |") + and output.count("| 2 | '3' | 'France' | 'n/a' |") + and output.count("| 3 | '22' | 'Germany' | '83' |") + and output.count("| 4 | None | 'Italy' | '60' |") + and output.count("| 5 | None | None | None |") + ) diff --git a/tests/resource/validate/test_general.py b/tests/resource/validate/test_general.py index 155a608929..c683eae3db 100644 --- a/tests/resource/validate/test_general.py +++ b/tests/resource/validate/test_general.py @@ -559,42 +559,3 @@ def test_validate_resource_errors_with_fields_993(): 'The data resource has an error: "fields" should be set as "resource.schema.fields" (not "resource.fields").', ] ] - - -def test_validate_resource_summary_invalid(): - resource = Resource("data/countries.csv") - report = resource.validate() - output = report.to_summary() - assert output.count("valid") and output.count("Summary") and output.count("Errors") - - -def test_validate_resource_validate_summary(): - resource = Resource("data/countries.csv") - report = resource.validate() - output = report.to_summary() - assert ( - output.count("File name |") - and output.count("File size (bytes) | 143") - and output.count("Total Time Taken (sec) |") - and output.count("Total Errors | 4") - and output.count("Extra Cell (extra-cell) | 1") - and output.count("Missing Cell (missing-cell) | 3") - ) - - -def test_validate_resource_validate_errors(): - resource = Resource("data/countries.csv") - report = resource.validate() - output = report.to_summary() - with open("data/fixtures/program/summary/errors.txt", encoding="utf-8") as file: - expected = file.read() - assert output.count(expected.strip()) - - -def test_validate_resource_summary_valid(): - resource = Resource("data/capital-valid.csv") - report = resource.validate() - output = report.to_summary() - assert ( - output.count("valid") and output.count("Summary") and not output.count("Errors") - ) diff --git a/tests/schema/test_general.py b/tests/schema/test_general.py index d68d4a87f7..242289a0d9 100644 --- a/tests/schema/test_general.py +++ b/tests/schema/test_general.py @@ -337,3 +337,69 @@ def test_schema_not_supported_type_issue_goodatbles_304(): schema = Schema({"fields": [{"name": "name"}, {"name": "age", "type": "bad"}]}) assert schema.metadata_valid is False assert schema.fields[1] == {"name": "age", "type": "bad"} + + +def test_schema_summary(): + schema = Schema(DESCRIPTOR_MAX) + output = schema.to_summary() + assert ( + output.count("| name | type | required |") + and output.count("| id | string | True |") + and output.count("| height | number | |") + and output.count("| age | integer | |") + and output.count("| name | string | |") + ) + + +def test_schema_summary_without_required(): + descriptor = { + "fields": [ + {"name": "test_1", "type": "string", "format": "default"}, + {"name": "test_2", "type": "string", "format": "default"}, + {"name": "test_3", "type": "string", "format": "default"}, + ] + } + schema = Schema(descriptor) + output = schema.to_summary() + assert ( + output.count("| name | type | required |") + and output.count("| test_1 | string | |") + and output.count("| test_2 | string | |") + and output.count("| test_3 | string | |") + ) + + +def test_schema_summary_without_type_missing_for_some_fields(): + descriptor = { + "fields": [ + {"name": "id", "format": "default"}, + {"name": "name", "type": "string", "format": "default"}, + {"name": "age", "format": "default"}, + ] + } + schema = Schema(descriptor) + output = schema.to_summary() + assert ( + output.count("| name | type | required |") + and output.count("| id | any | |") + and output.count("| name | string | |") + and output.count("| age | any | |") + ) + + +def test_schema_summary_with_name_missing_for_some_fields(): + descriptor = { + "fields": [ + {"type": "int", "format": "default"}, + {"type": "int", "format": "default"}, + {"name": "name", "type": "string", "format": "default"}, + ] + } + schema = Schema(descriptor) + output = schema.to_summary() + assert ( + output.count("| name | type | required |") + and output.count("| int | int | |") + and output.count("| int | int | |") + and output.count("| name | string | |") + ) From 2b62dc1e3987012e1bc48b16d938088132512d05 Mon Sep 17 00:00:00 2001 From: shashi gharti Date: Thu, 9 Jun 2022 17:43:22 +0545 Subject: [PATCH 10/10] fixes for failing tests --- tests/report/test_report.py | 11 ++++++++--- tests/report/test_reporttask.py | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/report/test_report.py b/tests/report/test_report.py index 8616bfad74..9dc89bc4eb 100644 --- a/tests/report/test_report.py +++ b/tests/report/test_report.py @@ -1,4 +1,7 @@ -from frictionless import validate +from frictionless import validate, helpers + + +IS_UNIX = not helpers.is_platform("windows") def test_program_error_not_found(): @@ -29,11 +32,12 @@ def test_report_summary_invalid(): def test_report_summary_validate_summary_valid(): report = validate("data/capital-valid.csv") output = report.to_summary() + file_size = 50 if IS_UNIX else 56 assert ( output.count("valid") and output.count("Summary") and output.count("File name | data/capital-valid.csv") - and output.count("File size (bytes) | 50 ") + and output.count(f"File size (bytes) | {file_size} ") and output.count("Total Time Taken (sec) | ") ) @@ -41,11 +45,12 @@ def test_report_summary_validate_summary_valid(): def test_report_summary_validate_summary_invalid(): report = validate("data/capital-invalid.csv") output = report.to_summary() + file_size = 171 if IS_UNIX else 183 assert ( output.count("invalid") and output.count("Summary") and output.count("File name | data/capital-invalid.csv") - and output.count("File size (bytes) | 171 ") + and output.count(f"File size (bytes) | {file_size} ") and output.count("Total Time Taken (sec) |") and output.count("Total Errors | 5 ") and output.count("Duplicate Label (duplicate-label) | 1 ") diff --git a/tests/report/test_reporttask.py b/tests/report/test_reporttask.py index a3558a3623..bd3e2f2726 100644 --- a/tests/report/test_reporttask.py +++ b/tests/report/test_reporttask.py @@ -1,12 +1,16 @@ -from frictionless import validate +from frictionless import validate, helpers + + +IS_UNIX = not helpers.is_platform("windows") def test_report_reporttask_summary_valid(): report = validate("data/capital-valid.csv") output = report.tasks[0].to_summary() + file_size = 50 if IS_UNIX else 56 assert ( output.count("File name | data/capital-valid.csv") - and output.count("File size (bytes) | 50 ") + and output.count(f"File size (bytes) | {file_size} ") and output.count("Total Time Taken (sec) | ") ) @@ -14,9 +18,10 @@ def test_report_reporttask_summary_valid(): def test_report_reporttask_summary_invalid(): report = validate("data/capital-invalid.csv") output = report.tasks[0].to_summary() + file_size = 171 if IS_UNIX else 183 assert ( output.count("File name | data/capital-invalid.csv") - and output.count("File size (bytes) | 171 ") + and output.count(f"File size (bytes) | {file_size} ") and output.count("Total Time Taken (sec) |") and output.count("Total Errors | 5 ") and output.count("Duplicate Label (duplicate-label) | 1 ")