Skip to content

Commit

Permalink
Added new CLI command summary (#1127)
Browse files Browse the repository at this point in the history
* Added command summary

* Added command named summary
* Added to_summary function to ReportTask
* Added tests for both 'summary' command and 'to_summary' function

* fixes for failing tests

* Revised code

* Rearranged code and placed it in the relevant places "separation of concerns"
* Made changes to tests and added new tests
* Revised summary command code

* added test for file not found error

* refactor validate command

* replaced the report table display code with new to_summary function
* fixed failing tests

* removed print statement

* made validation_summary part of the report class

* fixes to failing test

* simplified test output for multiline text

* Refactored the summary display code in the report.py class/file. Added validation_summary as to_summary to the report task class
* Changed the code of report.to_summary to use reporttask summary function
* Removed unrelated tests code added in previous commit and added new ones for schema, resource, reporttask and report
* Organized tests files for summary feature

* fixes for failing tests
  • Loading branch information
shashigharti authored Jun 10, 2022
1 parent b25c896 commit 8ba2621
Show file tree
Hide file tree
Showing 17 changed files with 687 additions and 221 deletions.
51 changes: 19 additions & 32 deletions data/fixtures/cli/long-error-messages-976.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
# -------
# invalid: test-tabulator
# -------
## Errors


# Summary

Description Size/Name/Count
------------------------------- -----------------
File name (Not Found) test-tabulator
File size N/A
Total Time Taken (sec)
Total Errors 1
Resource Error (resource-error) 1

# Errors

row field code message
----- ------- --------- -------------------------------------------------
resource- The data resource has an error: "{'format':
error 'inline', 'hashing': 'md5', 'name': 'test-
tabulator', 'profile': 'tabular-data-resource',
'resources': [{'name': 'first-resource', 'path':
'table.xls', 'schema': {'fields': [{'name': 'id',
'type': 'number'}, {'name': 'name', 'type':
'string'}]}}, {'name': 'number-two', 'path':
'table-reverse.csv', 'schema': {'fields':
[{'name': 'id', 'type': 'integer'}, {'name':
'name', 'type': 'string'}]}}], 'scheme': '',
'stats': {'bytes': 0, 'fields': 0, 'hash': '',
'rows': 0}} is not valid under any of the given
schemas" at "" in metadata and at "oneOf" in
profile
+-------+---------+-----------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+===========+===================================================+
| | | resource- | The data resource has an error: "{'format': |
| | | error | 'inline', 'hashing': 'md5', 'name': 'test- |
| | | | tabulator', 'profile': 'tabular-data-resource', |
| | | | 'resources': [{'name': 'first-resource', 'path': |
| | | | 'table.xls', 'schema': {'fields': [{'name': 'id', |
| | | | 'type': 'number'}, {'name': 'name', 'type': |
| | | | 'string'}]}}, {'name': 'number-two', 'path': |
| | | | 'table-reverse.csv', 'schema': {'fields': |
| | | | [{'name': 'id', 'type': 'integer'}, {'name': |
| | | | 'name', 'type': 'string'}]}}], 'scheme': '', |
| | | | 'stats': {'bytes': 0, 'fields': 0, 'hash': '', |
| | | | 'rows': 0}} is not valid under any of the given |
| | | | schemas" at "" in metadata and at "oneOf" in |
| | | | profile |
+-------+---------+-----------+---------------------------------------------------+
52 changes: 8 additions & 44 deletions data/fixtures/cli/zipped-resources-979.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,8 @@
# -----
# valid: ogd10_energieforschungstatistik_ch.csv
# -----

# Summary

Description Size/Name/Count
---------------------- --------------------------------------
File name ogd10_energieforschungstatistik_ch.csv
File size (KB) 88541
Total Time Taken (sec)
# -------
# invalid: ogd10_catalogs.zip => capital-invalid.csv
# -------


# Summary

Description Size/Name/Count
--------------------------- -----------------------------------------
File name (Not Found) ogd10_catalogs.zip => capital-invalid.csv
File size N/A
Total Time Taken (sec)
Total Errors 1
Schema Error (schema-error) 1

# Errors

row field code message
----- ------- ------- -------------------------------------------------
schema- Schema is not valid: Schemas with duplicate field
error names are not supported

# -----
# valid: ogd10_catalogs.zip => finanzquellen.csv
# -----

# Summary

Description Size/Name/Count
---------------------- ---------------------------------------
File name (Not Found) ogd10_catalogs.zip => finanzquellen.csv
File size N/A
Total Time Taken (sec)
## Errors

+-------+---------+---------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+=========+===================================================+
| | | schema- | Schema is not valid: Schemas with duplicate field |
| | | error | names are not supported |
+-------+---------+---------+---------------------------------------------------+
15 changes: 15 additions & 0 deletions data/fixtures/summary/multiline-errors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
+-------+---------+------------+----------------------------------------------------+
| row | field | code | message |
+=======+=========+============+====================================================+
| 4 | 5 | extra-cell | Row at position "4" has an extra value in field at |
| | | | position "5" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 2 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "neighbor_id" at position "2" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 3 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "name" at position "3" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 4 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "population" at position "4" |
+-------+---------+------------+----------------------------------------------------+
9 changes: 9 additions & 0 deletions data/fixtures/summary/multiline-scheme-error.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Errors

+-------+---------+---------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+=========+===================================================+
| | | scheme- | The data source could not be successfully loaded: |
| | | error | [Errno 2] No such file or directory: |
| | | | 'data/countriess.csv' |
+-------+---------+---------+---------------------------------------------------+
1 change: 1 addition & 0 deletions frictionless/program/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .api import program_api
from .describe import program_describe
from .extract import program_extract
from .summary import program_summary
from .main import program, program_main
from .transform import program_transform
from .validate import program_validate
13 changes: 12 additions & 1 deletion frictionless/program/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
import sys
import typer
from typing import Optional
from .. import settings


# Program

program = typer.Typer()

# TODO: remove this hack when Typer supports not-found commands catching
# https://github.com/tiangolo/typer/issues/18
class Program(typer.Typer):
    """Typer application that routes a bare path-like argument to ``summary``.

    Typer cannot yet catch unknown commands, so the rewrite is done on
    ``sys.argv`` before Typer parses it.
    """

    def __call__(self, *args, **kwargs):
        """Run the CLI, rewriting ``prog <arg>`` to ``prog summary <arg>``.

        The rewrite happens only when the first command-line argument
        contains a dot (the heuristic used for "looks like a file name").
        Any arguments after the first one are discarded by the rewrite.
        """
        # A bare invocation has no sys.argv[1]; guard the lookup so Typer can
        # print its usual usage/help output instead of raising IndexError.
        if len(sys.argv) > 1 and sys.argv[1].count("."):
            sys.argv = [sys.argv[0], "summary", sys.argv[1]]
        return super().__call__(*args, **kwargs)


program = Program()


# Helpers
Expand Down
44 changes: 44 additions & 0 deletions frictionless/program/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import typer
from .main import program
from . import common
from ..resource import Resource


@program.command(name="summary")
def program_summary(source: str = common.source):
    """Summary of data source.

    It will return schema, sample of the data and validation report for the resource.
    """
    # NOTE: the docstring above doubles as the command's CLI help text,
    # so it is kept verbatim.

    # Shared styling for every error message written to stderr
    error_style = dict(err=True, fg=typer.colors.RED, bold=True)

    # A source is mandatory; bail out early when it is missing
    if not source:
        typer.secho('Providing "source" is required', **error_style)
        raise typer.Exit(1)

    # Build the resource and infer its metadata; any failure ends the command
    try:
        resource = Resource(source)
        resource.infer()
    except Exception as error:
        typer.secho(str(error), **error_style)
        raise typer.Exit(1)

    # The describe and extract sections share one layout: blank line, bold
    # heading, blank line, body. The body is rendered lazily so that it is
    # computed only after its heading has been printed.
    sections = (
        ("# Describe ", lambda: resource.schema.to_summary()),
        ("# Extract ", lambda: resource.to_view()),
    )
    for heading, render in sections:
        typer.secho("")
        typer.secho(heading, bold=True)
        typer.secho("")
        typer.secho(str(render()))

    # Validate the resource; any failure ends the command
    try:
        report = resource.validate()
    except Exception as error:
        typer.secho(str(error), **error_style)
        raise typer.Exit(1)
    typer.secho("")
    typer.secho("# Validate ", bold=True)
    typer.secho(str(report.to_summary()))

    # Return retcode: 0 for a valid report, 1 otherwise
    exit_code = 0 if report.valid else 1
    raise typer.Exit(code=exit_code)
136 changes: 3 additions & 133 deletions frictionless/program/validate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sys
import os
import typer
import textwrap
from typing import List
from tabulate import tabulate
from ..actions import validate
Expand Down Expand Up @@ -217,7 +215,7 @@ def program_validate(
typer.secho(content)
raise typer.Exit()

# Return report
# Return validation report errors
if report.errors:
content = []
if is_stdin:
Expand All @@ -232,136 +230,8 @@ def program_validate(
str(tabulate(content, headers=["code", "message"], tablefmt="simple"))
)

# Return tables
prev_invalid = False
for number, task in enumerate(report.tasks, start=1):
tabular = task.resource.profile == "tabular-data-resource"
if number != 1 and prev_invalid:
typer.secho("")
prefix = "valid" if task.valid else "invalid"
suffix = "" if tabular else "(non-tabular)"
source = task.resource.path or task.resource.name
# for zipped resources append file name
if task.resource.innerpath:
source = f"{source} => {task.resource.innerpath}"
if is_stdin:
source = "stdin"
typer.secho(f"# {'-'*len(prefix)}", bold=True)
typer.secho(f"# {prefix}: {source} {suffix}", bold=True)
typer.secho(f"# {'-'*len(prefix)}", bold=True)
error_list = {}
if task.errors:
prev_invalid = True
typer.secho("")
content = []
for error in task.errors:
content.append(
[
error.get("rowPosition", ""),
error.get("fieldPosition", ""),
error.code,
error.message,
]
)
# error list for summary
error_title = f"{error.name} ({error.code})"
if error_title not in error_list:
error_list[error_title] = 0
error_list[error_title] += 1
if task.partial:
last_row_checked = error.get("rowPosition", "")
content = _wrap_text_to_colwidths(content)
# summary
rows_checked = last_row_checked if task.partial else None
summary_content = _validation_summary(
source,
basepath=task.resource.basepath,
time_taken=task.time,
rows_checked=rows_checked,
error_list=error_list,
)
typer.echo("\n# Summary \n")
if task.partial:
typer.echo(
"The document was partially validated because of one of the limits"
)
typer.echo("* limit errors")
typer.echo("* memory Limit \n")
typer.secho(
str(
tabulate(
summary_content,
headers=["Description", "Size/Name/Count"],
tablefmt="simple",
)
)
)
# errors
if task.errors:
typer.echo("\n# Errors \n")
typer.secho(
str(
tabulate(
content,
headers=["row", "field", "code", "message"],
tablefmt="simple",
)
)
)
# Return validation report summary and tables
typer.secho(str(report.to_summary()))

# Return retcode
raise typer.Exit(code=int(not report.valid))


# TODO: This is a temporary helper to use with tabulate, as
# tabulate 0.8.9 does not support text wrapping
def _wrap_text_to_colwidths(
list_of_lists: List, colwidths: List = [5, 5, 10, 50]
) -> List:
"""Create new list with wrapped text with different column width.
Args:
list_of_lists (List): List of lines
colwidths (List): width for each column
Returns:
List: list of lines with wrapped text
"""
result = []
for row in list_of_lists:
new_row = []
for cell, width in zip(row, colwidths):
cell = str(cell)
wrapped = textwrap.wrap(cell, width=width)
new_row.append("\n".join(wrapped))
result.append(new_row)
return result


def _validation_summary(
source: str,
time_taken: str,
basepath: str = None,
rows_checked: int = None,
error_list: List = None,
) -> List:
"""Generate summary for validation task"""
file_path = os.path.join(basepath, source) if basepath else source
file_size = "N/A"
unit = None
if os.path.exists(file_path):
file_size = os.path.getsize(file_path)
unit = helpers.format_bytes(file_size)
content = [
[f"File name { '' if unit else '(Not Found)' }", source],
[f"File size { f'({unit})' if unit else '' }", file_size],
["Total Time Taken (sec)", time_taken],
]
if rows_checked:
content.append(["Rows Checked(Partial)**", rows_checked])
if error_list:
content.append(["Total Errors", sum(error_list.values())])
for code, count in error_list.items():
content.append([code, count])

return content
Loading

0 comments on commit 8ba2621

Please sign in to comment.