Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new command 'summary' #1127

Merged
merged 12 commits into from
Jun 10, 2022
15 changes: 15 additions & 0 deletions data/fixtures/program/summary/errors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
+-------+---------+------------+----------------------------------------------------+
| row | field | code | message |
+=======+=========+============+====================================================+
| 4 | 5 | extra-cell | Row at position "4" has an extra value in field at |
| | | | position "5" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 2 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "neighbor_id" at position "2" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 3 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "name" at position "3" |
+-------+---------+------------+----------------------------------------------------+
| 7 | 4 | missing- | Row at position "7" has a missing cell in field |
| | | cell | "population" at position "4" |
+-------+---------+------------+----------------------------------------------------+
52 changes: 52 additions & 0 deletions frictionless/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,10 +634,62 @@ def dicts_to_markdown_table(dicts: List[dict], **kwargs) -> str:
return df.where(df.notnull(), None).to_markdown(index=False)


# TODO: This is a temporary helper for use with tabulate, because
# tabulate 0.8.9 does not support text wrapping
def wrap_text_to_colwidths(list_of_lists: List, colwidths: List = None) -> List:
    """Create new list with wrapped text with different column width.

    Every cell is converted with ``str`` and wrapped with ``textwrap.wrap``;
    the wrapped pieces are re-joined with newlines so that a table renderer
    can print them as one multi-line cell.

    Args:
        list_of_lists (List): List of lines (each line is a list of cells)
        colwidths (List): width for each column, by position; when omitted
            the widths ``[5, 5, 10, 50]`` are used

    Returns:
        List: list of lines with wrapped text. Cells are paired with widths
        through ``zip``, so cells beyond ``len(colwidths)`` are not included.

    """
    # Resolve the default here instead of in the signature so that no mutable
    # list object is shared between calls.
    if colwidths is None:
        colwidths = [5, 5, 10, 50]
    result = []
    for row in list_of_lists:
        result.append(
            [
                "\n".join(textwrap.wrap(str(cell), width=width))
                for cell, width in zip(row, colwidths)
            ]
        )
    return result


def format_bytes(size: int) -> str:
    """Format bytes to larger units

    Returns only the unit name ("bytes", "KB", "MB", "GB" or "TB") whose
    power-of-1024 bracket contains ``size``; the number itself is not scaled.

    Args:
        size (int): size in bytes

    Returns:
        str: unit name; sizes below 1 map to "bytes" and sizes of 1024**5
        or more are capped at "TB"

    """
    units = ["bytes", "KB", "MB", "GB", "TB"]
    # log2 is undefined for 0 (e.g. an empty file) and for negative numbers
    if size < 1:
        return units[0]
    # Every 10 bits (a factor of 1024) moves one unit up
    index = math.floor(math.log2(size) / 10)
    # Clamp to the largest known unit instead of indexing past the list
    return units[min(index, len(units) - 1)]


def validation_summary(
    source: str,
    time_taken: str,
    basepath: str = None,
    rows_checked: int = None,
    error_list: dict = None,
) -> List:
    """Generate summary for validation task

    Args:
        source (str): file path/name shown in the summary
        time_taken (str): validation time, shown as given
        basepath (str): when set, joined with ``source`` to locate the file
        rows_checked (int): number of rows checked; a row is added to the
            summary only when this value is truthy
        error_list (dict): mapping of error title to number of occurrences

    Returns:
        List: rows of ``[description, value]`` pairs for a two-column table

    """
    file_path = os.path.join(basepath, source) if basepath else source
    file_size = "N/A"
    unit = None
    # The size and its unit are only known for a file present on disk
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        unit = format_bytes(file_size)
    content = [
        [f"File name { '' if unit else '(Not Found)' }", source],
        [f"File size { f'({unit})' if unit else '' }", file_size],
        ["Total Time Taken (sec)", time_taken],
    ]
    if rows_checked:
        content.append(["Rows Checked(Partial)**", rows_checked])
    if error_list:
        # Grand total first, followed by one row per error title
        content.append(["Total Errors", sum(error_list.values())])
        content.extend([code, count] for code, count in error_list.items())

    return content
1 change: 1 addition & 0 deletions frictionless/program/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .api import program_api
from .describe import program_describe
from .extract import program_extract
from .summary import program_summary
from .main import program, program_main
from .transform import program_transform
from .validate import program_validate
13 changes: 12 additions & 1 deletion frictionless/program/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
import sys
import typer
from typing import Optional
from .. import settings


# Program

program = typer.Typer()

# TODO: remove this hack when Typer supports not-found commands catching
# https://github.com/tiangolo/typer/issues/18
class Program(typer.Typer):
    """Typer application that routes a bare source argument to ``summary``.

    When the first command-line argument contains a dot it is treated as a
    data source rather than a command name, and the command line is
    rewritten to ``summary <argument>`` before Typer processes it.
    """

    def __call__(self, *args, **kwargs):
        # A bare invocation has no sys.argv[1]; leave it to Typer untouched
        # instead of failing with IndexError.
        if len(sys.argv) > 1 and sys.argv[1].count("."):
            sys.argv = [sys.argv[0], "summary", sys.argv[1]]
        return super().__call__(*args, **kwargs)


program = Program()


# Helpers
Expand Down
119 changes: 119 additions & 0 deletions frictionless/program/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import typer
from tabulate import tabulate
from .main import program
from . import common
from .. import helpers
from ..layout import Layout
from ..resource import Resource


@program.command(name="summary")
def program_summary(source: str = common.source):
    """Summary of data source.

    It will return schema, sample of the data and validation report for the resource.
    """
    # NOTE(review): the docstring above is left verbatim because it is
    # presumably surfaced by typer as the command's help text - confirm.
    #
    # Flow: infer the resource, print its schema ("Describe"), print a data
    # view ("Extract"), then validate and print a summary table plus an error
    # table for each report task. Exits with code 1 on any failure or when
    # the report is invalid, and with code 0 otherwise.

    # Validate input
    if not source:
        message = 'Providing "source" is required'
        typer.secho(message, err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Infer Resource
    try:
        resource = Resource(source, layout=Layout(limit_rows=5))
        resource.infer()
    except Exception as exception:
        typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)

    # Describe data
    content = [
        [field.name, field.type, True if field.required else ""]
        for field in resource.schema.fields
    ]
    typer.secho("")
    typer.secho("# Describe ", bold=True)
    typer.secho("")
    typer.secho(tabulate(content, headers=["name", "type", "required"], tablefmt="grid"))

    # Extract data
    try:
        resource.extract()
    except Exception as exception:
        typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)
    typer.secho("")
    typer.secho("# Extract ", bold=True)
    typer.secho("")
    typer.secho(resource.to_view())

    # Validate data
    try:
        report = resource.validate()
    except Exception as exception:
        typer.secho(str(exception), err=True, fg=typer.colors.RED, bold=True)
        raise typer.Exit(1)
    typer.secho("")
    typer.secho("# Validate ", bold=True)
    typer.secho("")
    for task in report.tasks:
        # Accumulators are per task so that each summary/errors table only
        # reflects the task it is printed for.
        error_content = []
        error_list = {}
        # Stays None when a partial task reports no errors
        last_row_checked = None
        tabular = task.resource.profile == "tabular-data-resource"
        prefix = "valid" if task.valid else "invalid"
        suffix = "" if tabular else "(non-tabular)"
        source = task.resource.path or task.resource.name
        # for zipped resources append file name
        if task.resource.innerpath:
            source = f"{source} => {task.resource.innerpath}"
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        typer.secho(f"# {prefix}: {source} {suffix}", bold=True)
        typer.secho(f"# {'-'*len(prefix)}", bold=True)
        # Iterate the errors of the current task (not always the first one)
        for error in task.errors:
            error_content.append(
                [
                    error.get("rowPosition", ""),
                    error.get("fieldPosition", ""),
                    error.code,
                    error.message,
                ]
            )
            # error list for summary
            error_title = f"{error.name} ({error.code})"
            error_list[error_title] = error_list.get(error_title, 0) + 1
            if task.partial:
                last_row_checked = error.get("rowPosition", "")
        error_content = helpers.wrap_text_to_colwidths(error_content)
        rows_checked = last_row_checked if task.partial else None
        summary_content = helpers.validation_summary(
            source,
            basepath=task.resource.basepath,
            time_taken=task.time,
            rows_checked=rows_checked,
            error_list=error_list,
        )
        typer.secho("")
        typer.secho("## Summary ", bold=True)
        typer.secho("")
        typer.secho(
            str(
                tabulate(
                    summary_content,
                    headers=["Description", "Size/Name/Count"],
                    tablefmt="grid",
                )
            )
        )
        if error_content:
            typer.secho("")
            typer.secho("## Errors ", bold=True)
            typer.secho("")
            typer.secho(
                tabulate(
                    error_content,
                    headers=["row", "field", "code", "message"],
                    tablefmt="grid",
                )
            )

    # Return retcode
    raise typer.Exit(code=int(not report.valid))
60 changes: 2 additions & 58 deletions frictionless/program/validate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sys
import os
import typer
import textwrap
from typing import List
from tabulate import tabulate
from ..actions import validate
Expand Down Expand Up @@ -270,10 +268,10 @@ def program_validate(
error_list[error_title] += 1
if task.partial:
last_row_checked = error.get("rowPosition", "")
content = _wrap_text_to_colwidths(content)
content = helpers.wrap_text_to_colwidths(content)
# summary
rows_checked = last_row_checked if task.partial else None
summary_content = _validation_summary(
summary_content = helpers.validation_summary(
source,
basepath=task.resource.basepath,
time_taken=task.time,
Expand Down Expand Up @@ -311,57 +309,3 @@ def program_validate(

# Return retcode
raise typer.Exit(code=int(not report.valid))


# TODO: This is a temporary helper for use with tabulate, because
# tabulate 0.8.9 does not support text wrapping
def _wrap_text_to_colwidths(list_of_lists: List, colwidths: List = None) -> List:
    """Create new list with wrapped text with different column width.

    Every cell is converted with ``str`` and wrapped with ``textwrap.wrap``;
    the wrapped pieces are re-joined with newlines so that a table renderer
    can print them as one multi-line cell.

    Args:
        list_of_lists (List): List of lines (each line is a list of cells)
        colwidths (List): width for each column, by position; when omitted
            the widths ``[5, 5, 10, 50]`` are used

    Returns:
        List: list of lines with wrapped text. Cells are paired with widths
        through ``zip``, so cells beyond ``len(colwidths)`` are not included.

    """
    # Resolve the default here instead of in the signature so that no mutable
    # list object is shared between calls.
    if colwidths is None:
        colwidths = [5, 5, 10, 50]
    result = []
    for row in list_of_lists:
        result.append(
            [
                "\n".join(textwrap.wrap(str(cell), width=width))
                for cell, width in zip(row, colwidths)
            ]
        )
    return result


def _validation_summary(
    source: str,
    time_taken: str,
    basepath: str = None,
    rows_checked: int = None,
    error_list: dict = None,
) -> List:
    """Generate summary for validation task

    Args:
        source (str): file path/name shown in the summary
        time_taken (str): validation time, shown as given
        basepath (str): when set, joined with ``source`` to locate the file
        rows_checked (int): number of rows checked; a row is added to the
            summary only when this value is truthy
        error_list (dict): mapping of error title to number of occurrences

    Returns:
        List: rows of ``[description, value]`` pairs for a two-column table

    """
    file_path = os.path.join(basepath, source) if basepath else source
    file_size = "N/A"
    unit = None
    # The size and its unit are only known for a file present on disk
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        unit = helpers.format_bytes(file_size)
    content = [
        [f"File name { '' if unit else '(Not Found)' }", source],
        [f"File size { f'({unit})' if unit else '' }", file_size],
        ["Total Time Taken (sec)", time_taken],
    ]
    if rows_checked:
        content.append(["Rows Checked(Partial)**", rows_checked])
    if error_list:
        # Grand total first, followed by one row per error title
        content.append(["Total Errors", sum(error_list.values())])
        content.extend([code, count] for code, count in error_list.items())

    return content
Loading