Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new command 'summary' #1127

Merged
merged 12 commits into from
Jun 10, 2022
51 changes: 19 additions & 32 deletions data/fixtures/cli/long-error-messages-976.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
# -------
# invalid: test-tabulator
# -------
## Errors


# Summary

Description Size/Name/Count
------------------------------- -----------------
File name (Not Found) test-tabulator
File size N/A
Total Time Taken (sec)
Total Errors 1
Resource Error (resource-error) 1

# Errors

row field code message
----- ------- --------- -------------------------------------------------
resource- The data resource has an error: "{'format':
error 'inline', 'hashing': 'md5', 'name': 'test-
tabulator', 'profile': 'tabular-data-resource',
'resources': [{'name': 'first-resource', 'path':
'table.xls', 'schema': {'fields': [{'name': 'id',
'type': 'number'}, {'name': 'name', 'type':
'string'}]}}, {'name': 'number-two', 'path':
'table-reverse.csv', 'schema': {'fields':
[{'name': 'id', 'type': 'integer'}, {'name':
'name', 'type': 'string'}]}}], 'scheme': '',
'stats': {'bytes': 0, 'fields': 0, 'hash': '',
'rows': 0}} is not valid under any of the given
schemas" at "" in metadata and at "oneOf" in
profile
+-------+---------+-----------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+===========+===================================================+
| | | resource- | The data resource has an error: "{'format': |
| | | error | 'inline', 'hashing': 'md5', 'name': 'test- |
| | | | tabulator', 'profile': 'tabular-data-resource', |
| | | | 'resources': [{'name': 'first-resource', 'path': |
| | | | 'table.xls', 'schema': {'fields': [{'name': 'id', |
| | | | 'type': 'number'}, {'name': 'name', 'type': |
| | | | 'string'}]}}, {'name': 'number-two', 'path': |
| | | | 'table-reverse.csv', 'schema': {'fields': |
| | | | [{'name': 'id', 'type': 'integer'}, {'name': |
| | | | 'name', 'type': 'string'}]}}], 'scheme': '', |
| | | | 'stats': {'bytes': 0, 'fields': 0, 'hash': '', |
| | | | 'rows': 0}} is not valid under any of the given |
| | | | schemas" at "" in metadata and at "oneOf" in |
| | | | profile |
+-------+---------+-----------+---------------------------------------------------+
52 changes: 8 additions & 44 deletions data/fixtures/cli/zipped-resources-979.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,8 @@
# -----
# valid: ogd10_energieforschungstatistik_ch.csv
# -----

# Summary

Description Size/Name/Count
---------------------- --------------------------------------
File name ogd10_energieforschungstatistik_ch.csv
File size (KB) 88541
Total Time Taken (sec)
# -------
# invalid: ogd10_catalogs.zip => capital-invalid.csv
# -------


# Summary

Description Size/Name/Count
--------------------------- -----------------------------------------
File name (Not Found) ogd10_catalogs.zip => capital-invalid.csv
File size N/A
Total Time Taken (sec)
Total Errors 1
Schema Error (schema-error) 1

# Errors

row field code message
----- ------- ------- -------------------------------------------------
schema- Schema is not valid: Schemas with duplicate field
error names are not supported

# -----
# valid: ogd10_catalogs.zip => finanzquellen.csv
# -----

# Summary

Description Size/Name/Count
---------------------- ---------------------------------------
File name (Not Found) ogd10_catalogs.zip => finanzquellen.csv
File size N/A
Total Time Taken (sec)
## Errors

+-------+---------+---------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+=========+===================================================+
| | | schema- | Schema is not valid: Schemas with duplicate field |
| | | error | names are not supported |
+-------+---------+---------+---------------------------------------------------+
15 changes: 15 additions & 0 deletions data/fixtures/summary/multiline-errors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
+-------+---------+------------+----------------------------------------------------+
| row | field | code | message |
+=======+=========+============+====================================================+
| 4 | 5 | extra-cell | Row at position "4" has an extra value in field at |
| | | | position "5" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 2 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "neighbor_id" at position "2" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 3 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "name" at position "3" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 4 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "population" at position "4" |
+-------+---------+------------+----------------------------------------------------+
9 changes: 9 additions & 0 deletions data/fixtures/summary/multiline-scheme-error.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Errors

+-------+---------+---------+---------------------------------------------------+
| row | field | code | message |
+=======+=========+=========+===================================================+
| | | scheme- | The data source could not be successfully loaded: |
| | | error | [Errno 2] No such file or directory: |
| | | | 'data/countriess.csv' |
+-------+---------+---------+---------------------------------------------------+
1 change: 1 addition & 0 deletions frictionless/program/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .api import program_api
from .describe import program_describe
from .extract import program_extract
from .summary import program_summary
from .main import program, program_main
from .transform import program_transform
from .validate import program_validate
13 changes: 12 additions & 1 deletion frictionless/program/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
import sys
import typer
from typing import Optional
from .. import settings


# Program

program = typer.Typer()

# TODO: remove this hack when Typer supports not-found commands catching
# https://github.com/tiangolo/typer/issues/18
class Program(typer.Typer):
    """Typer application that routes a bare source argument to ``summary``.

    When the first command-line argument contains a dot, ``sys.argv`` is
    rewritten to ``[<prog>, "summary", <that argument>]`` before Typer parses
    it. Any further arguments are dropped by that rewrite.
    """

    def __call__(self, *args, **kwargs):
        # A bare invocation has no sys.argv[1]; without the length check the
        # lookup raises IndexError before Typer can print its usage text.
        # NOTE(review): the dot test presumably detects a file name/path as
        # opposed to a command name -- confirm against the CLI's command list.
        if len(sys.argv) > 1 and "." in sys.argv[1]:
            sys.argv = [sys.argv[0], "summary", sys.argv[1]]
        return super().__call__(*args, **kwargs)


program = Program()


# Helpers
Expand Down
44 changes: 44 additions & 0 deletions frictionless/program/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import typer
from .main import program
from . import common
from ..resource import Resource


@program.command(name="summary")
def program_summary(source: str = common.source):
    """Summary of data source.

    It will return schema, sample of the data and validation report for the resource.
    """
    # NOTE: the docstring above is kept verbatim; Typer surfaces a command's
    # docstring as its help text, so it is user-facing output.

    def fail(text):
        # Report the problem in bold red on stderr, then stop with exit code 1
        typer.secho(text, err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    def heading(title):
        # Blank separator line followed by the bold section title
        typer.secho("")
        typer.secho(title, bold=True)

    # Validate input
    if not source:
        fail('Providing "source" is required')

    # Infer Resource
    try:
        resource = Resource(source)
        resource.infer()
    except Exception as exception:
        fail(str(exception))

    # Describe: schema summary of the inferred resource
    heading("# Describe ")
    typer.secho("")
    typer.secho(str(resource.schema.to_summary()))

    # Extract: view of the resource's data
    heading("# Extract ")
    typer.secho("")
    typer.secho(str(resource.to_view()))

    # Validate: any failure here is reported the same way as inference errors
    try:
        report = resource.validate()
    except Exception as exception:
        fail(str(exception))
    heading("# Validate ")
    typer.secho(str(report.to_summary()))

    # Return retcode: 0 when the report is valid, 1 otherwise
    raise typer.Exit(code=int(not report.valid))
136 changes: 3 additions & 133 deletions frictionless/program/validate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sys
import os
import typer
import textwrap
from typing import List
from tabulate import tabulate
from ..actions import validate
Expand Down Expand Up @@ -217,7 +215,7 @@ def program_validate(
typer.secho(content)
raise typer.Exit()

# Return report
# Return validation report errors
if report.errors:
content = []
if is_stdin:
Expand All @@ -232,136 +230,8 @@ def program_validate(
str(tabulate(content, headers=["code", "message"], tablefmt="simple"))
)

# Return tables
prev_invalid = False
for number, task in enumerate(report.tasks, start=1):
tabular = task.resource.profile == "tabular-data-resource"
if number != 1 and prev_invalid:
typer.secho("")
prefix = "valid" if task.valid else "invalid"
suffix = "" if tabular else "(non-tabular)"
source = task.resource.path or task.resource.name
# for zipped resources append file name
if task.resource.innerpath:
source = f"{source} => {task.resource.innerpath}"
if is_stdin:
source = "stdin"
typer.secho(f"# {'-'*len(prefix)}", bold=True)
typer.secho(f"# {prefix}: {source} {suffix}", bold=True)
typer.secho(f"# {'-'*len(prefix)}", bold=True)
error_list = {}
if task.errors:
prev_invalid = True
typer.secho("")
content = []
for error in task.errors:
content.append(
[
error.get("rowPosition", ""),
error.get("fieldPosition", ""),
error.code,
error.message,
]
)
# error list for summary
error_title = f"{error.name} ({error.code})"
if error_title not in error_list:
error_list[error_title] = 0
error_list[error_title] += 1
if task.partial:
last_row_checked = error.get("rowPosition", "")
content = _wrap_text_to_colwidths(content)
# summary
rows_checked = last_row_checked if task.partial else None
summary_content = _validation_summary(
source,
basepath=task.resource.basepath,
time_taken=task.time,
rows_checked=rows_checked,
error_list=error_list,
)
typer.echo("\n# Summary \n")
if task.partial:
typer.echo(
"The document was partially validated because of one of the limits"
)
typer.echo("* limit errors")
typer.echo("* memory Limit \n")
typer.secho(
str(
tabulate(
summary_content,
headers=["Description", "Size/Name/Count"],
tablefmt="simple",
)
)
)
# errors
if task.errors:
typer.echo("\n# Errors \n")
typer.secho(
str(
tabulate(
content,
headers=["row", "field", "code", "message"],
tablefmt="simple",
)
)
)
# Return validation report summary and tables
typer.secho(str(report.to_summary()))

# Return retcode
raise typer.Exit(code=int(not report.valid))


# TODO: This is a temporary helper for use with tabulate, because
# tabulate 0.8.9 does not support text wrapping
def _wrap_text_to_colwidths(
list_of_lists: List, colwidths: List = [5, 5, 10, 50]
) -> List:
"""Create new list with wrapped text with different column width.
Args:
list_of_lists (List): List of lines
colwidths (List): width for each column

Returns:
List: list of lines with wrapped text

"""
result = []
for row in list_of_lists:
new_row = []
for cell, width in zip(row, colwidths):
cell = str(cell)
wrapped = textwrap.wrap(cell, width=width)
new_row.append("\n".join(wrapped))
result.append(new_row)
return result


def _validation_summary(
source: str,
time_taken: str,
basepath: str = None,
rows_checked: int = None,
error_list: List = None,
) -> List:
"""Generate summary for validation task"""
file_path = os.path.join(basepath, source) if basepath else source
file_size = "N/A"
unit = None
if os.path.exists(file_path):
file_size = os.path.getsize(file_path)
unit = helpers.format_bytes(file_size)
content = [
[f"File name { '' if unit else '(Not Found)' }", source],
[f"File size { f'({unit})' if unit else '' }", file_size],
["Total Time Taken (sec)", time_taken],
]
if rows_checked:
content.append(["Rows Checked(Partial)**", rows_checked])
if error_list:
content.append(["Total Errors", sum(error_list.values())])
for code, count in error_list.items():
content.append([code, count])

return content
Loading