diff --git a/python/tests/api/writer/test_whylabs.py b/python/tests/api/writer/test_whylabs.py index 87ae5bea99..1f585ea583 100644 --- a/python/tests/api/writer/test_whylabs.py +++ b/python/tests/api/writer/test_whylabs.py @@ -342,22 +342,6 @@ def test_option_will_overwrite_defaults(self) -> None: assert writer._whylabs_client._dataset_id == "new_dataset_id" assert writer.key_id == "newkeynewk" - def test_api_key_prefers_parameter_over_env_var(self, results, caplog): - with pytest.raises(ValueError): - results.writer("whylabs").option(org_id="org_id", api_key="api_key_123.foo").write(dataset_id="dataset_id") - - def test_writer_accepts_dest_param(self, results, caplog): - # TODO: inspect error or mock better to avoid network call and keep test focused. - with pytest.raises(ValueError): - results.writer("whylabs").option(api_key="bad_key_format").write(dataset_id="dataset_id", dest="tmp") - - def test_write_response(self, results): - with pytest.raises(ValueError): - response = ( - results.writer("whylabs").option(api_key="bad_key_format").write(dataset_id="dataset_id", dest="tmp") - ) - assert response[0] is True - def test_changing_api_key_works(self) -> None: # # Defaults diff --git a/python/whylogs/__init__.py b/python/whylogs/__init__.py index 561229fc78..e36c2ab4ea 100644 --- a/python/whylogs/__init__.py +++ b/python/whylogs/__init__.py @@ -26,7 +26,6 @@ reader, write, ) -from .api.usage_stats import emit_usage as __emit_usage_stats from .api.whylabs import init from .core import DatasetProfileView from .migration.converters import v0_to_v1_view @@ -65,5 +64,3 @@ def package_version(package: str = __package__) -> str: __version__, init, ] - -__emit_usage_stats("import") diff --git a/python/whylogs/api/fugue/__init__.py b/python/whylogs/api/fugue/__init__.py index 558f42147e..8cf0b28ed4 100644 --- a/python/whylogs/api/fugue/__init__.py +++ b/python/whylogs/api/fugue/__init__.py @@ -1,7 +1,3 @@ -# flake8: noqa -from whylogs.api.usage_stats import emit_usage +from .profiler import fugue_profile -# This import has a side effect -from .profiler import fugue_profile # type: ignore - -emit_usage("fugue") +assert fugue_profile is not None diff --git a/python/whylogs/api/logger/__init__.py b/python/whylogs/api/logger/__init__.py index 15f666173c..3653d0914e 100644 --- a/python/whylogs/api/logger/__init__.py +++ b/python/whylogs/api/logger/__init__.py @@ -20,7 +20,6 @@ _log_segment, ) from whylogs.api.logger.transient import TransientLogger -from whylogs.api.usage_stats import emit_usage from whylogs.api.whylabs.session.notebook_logger import ( notebook_session_log, notebook_session_log_comparison, @@ -55,7 +54,6 @@ def log( ) -> ResultSet: if multiple is not None: result_sets: Dict[str, ResultSet] = {} - emit_usage("multiple") for alias, data in multiple.items(): result_set = TransientLogger(schema=schema).log(data, trace_id=trace_id) if dataset_timestamp is not None: diff --git a/python/whylogs/api/pyspark/experimental/profiler.py b/python/whylogs/api/pyspark/experimental/profiler.py index 9cd0a58842..3b28aaf578 100644 --- a/python/whylogs/api/pyspark/experimental/profiler.py +++ b/python/whylogs/api/pyspark/experimental/profiler.py @@ -4,7 +4,6 @@ from typing import Dict, Iterable, Optional, Tuple import whylogs as why -from whylogs.api.usage_stats import emit_usage from whylogs.core import DatasetSchema from whylogs.core.metrics.metrics import conf from whylogs.core.stubs import pd @@ -12,7 +11,6 @@ from whylogs.core.view.dataset_profile_view import DatasetProfileView logger = getLogger(__name__) -emit_usage("pyspark") try: # type: ignore from pyspark.ml.functions import vector_to_array diff --git a/python/whylogs/api/pyspark/experimental/segmented_profiler.py b/python/whylogs/api/pyspark/experimental/segmented_profiler.py index 4d4bf9af32..52763e3b0c 100644 --- a/python/whylogs/api/pyspark/experimental/segmented_profiler.py +++ b/python/whylogs/api/pyspark/experimental/segmented_profiler.py @@ -8,7 +8,6 @@ import whylogs as why from whylogs.api.logger.result_set import ResultSet, SegmentedResultSet from whylogs.api.pyspark.experimental.profiler import COL_NAME_FIELD, COL_PROFILE_FIELD -from whylogs.api.usage_stats import emit_usage from whylogs.core import DatasetSchema from whylogs.core.segment import Segment from whylogs.core.segmentation_partition import SegmentationPartition @@ -17,7 +16,6 @@ from whylogs.core.view.dataset_profile_view import DatasetProfileView logger = getLogger(__name__) -emit_usage("pyspark") try: # type: ignore from pyspark.ml.functions import vector_to_array diff --git a/python/whylogs/api/usage_stats/__init__.py b/python/whylogs/api/usage_stats/__init__.py deleted file mode 100644 index 82061e4fe7..0000000000 --- a/python/whylogs/api/usage_stats/__init__.py +++ /dev/null @@ -1,218 +0,0 @@ -import atexit -import hashlib -import http.client -import json -import logging -import os -import site -import socket -import sys -import uuid -from datetime import datetime -from threading import Thread -from typing import Any, Dict, List, Optional -from urllib import request - -import whylogs - -_TELEMETRY_ENDPOINT = "https://stats.whylogs.com/" -if os.getenv("TELEMETRY_DEV"): - _TELEMETRY_ENDPOINT = "https://staging-stats.whylogs.com" -_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" -logger = logging.getLogger(__name__) - -ANALYTICS_OPT_OUT = "WHYLOGS_NO_ANALYTICS" - -# Flag to disable it internally -_TELEMETRY_DISABLED = False -_TRACKED_EVENTS: Dict[str, bool] = {} -_SITE_PACKAGES: List[str] = [] - -try: - # fix for virtualenv lack of definition for getsitepackages - if hasattr(site, "getsitepackages"): - _SITE_PACKAGES = site.getsitepackages() - else: - from distutils.sysconfig import get_python_lib - - _SITE_PACKAGES = [get_python_lib()] -except: # noqa - logger.debug("Encountered exception when checking site packages") - -if os.getenv(ANALYTICS_OPT_OUT) is not None: - logger.debug("Opted out of usage statistics. Skipping.") - _TELEMETRY_DISABLED = True - -try: - if os.path.exists(os.path.expanduser("~/.whylogs/disable_telemetry")): - _TELEMETRY_DISABLED = True -except: # noqa - logger.info("Encounter exception when checking file system. Disable telemetry by default") - _TELEMETRY_DISABLED = True - - -def emit_usage(event: str) -> None: - global _TELEMETRY_DISABLED - global _TRACKED_EVENTS - if _TELEMETRY_DISABLED: - return - if _TRACKED_EVENTS.get(event): - return - _TRACKED_EVENTS[event] = True - - t = Thread(target=_do_emit_usage, args=(event,)) - t.start() - - atexit.register(t.join) - - -_metadata = None -_identity = None - - -def _do_emit_usage(event: str) -> None: - global _TELEMETRY_DISABLED - if _TELEMETRY_DISABLED: - logger.debug("Opted out of usage statistics. Skipping.") - return - - logger.debug("Telemetry opted in. Emitting usage statistics") - - global _identity - global _metadata - if _identity is None: - _identity = _calc_identity() - if _metadata is None: - _metadata = _build_metadata() - - _send_stats_event(event, _identity, _metadata) - - -def _calc_identity() -> str: - try: - hashed_computer_name = hashlib.sha512(bytes(socket.gethostname(), encoding="utf8")) - return hashed_computer_name.hexdigest() - except socket.timeout as exc: - logger.debug( - "Socket timeout when trying to get the computer name. Exception: %s", - exc, - ) - return uuid.uuid4().hex - - -def _build_metadata() -> Dict[str, Any]: - """Hash system and project data to send to our stats endpoint.""" - - if hasattr(whylogs, "__version__"): - project_version = whylogs.__version__ - else: - import whylogs as why - - project_version = why.package_version() - (major, minor, macro, _, _) = sys.version_info - - metadata = { - "project_version": project_version, - "python_version": f"{major}.{minor}.{macro}", - "python_version_full": sys.version, - "terminal": _get_terminal_mode(), - "os": sys.platform, - "conda": ("CONDA_DEFAULT_ENV" in os.environ), - "venv": ("VIRTUAL_ENV" in os.environ), - "environment": _get_environment(), - } - - # track various integrations - integrations = { - "numpy": _has_lib("numpy"), - "pandas": _has_lib("pandas"), - "mlflow": _has_lib("mlflow"), - "dask": _has_lib("dask"), - "ray": _has_lib("ray"), - "airflow": _has_lib("airflow"), - "pyspark": _has_lib("pyspark"), - "flyte": _has_lib("flyte"), - "kafka": _has_lib("kafka"), - "langkit": _has_lib("langkit"), - } - for k in list(integrations.keys()): - if integrations.get(k) is False: - integrations.pop(k) - - # add integration metadata - metadata.update(integrations) - return metadata - - -def _send_stats_event(event_name: str, identity: str, properties: Optional[Dict[str, Any]] = None) -> None: - data = { - "identity": identity, - "event": event_name, - "timestamp": datetime.utcnow().strftime(_TIMESTAMP_FORMAT), - "properties": properties or {}, - } - global _TELEMETRY_DISABLED - json_data = json.dumps(data).encode() - req = request.Request(_TELEMETRY_ENDPOINT, data=json_data, method="POST") - req.add_header("Content-Type", "application/json") - - resp: http.client.HTTPResponse = None # type: ignore - try: - resp = request.urlopen(req, timeout=3) - if resp.status != 200: - logger.info("Unable to send usage stats. Disabling whylogs api usage collection.") - _TELEMETRY_DISABLED = True - logger.debug("Response: %s", resp.read()) - except: # noqa - logger.info("Connection error. Skip whylogs api usage collection.") - _TELEMETRY_DISABLED = True - - finally: - if resp is not None: - resp.close() - - -def _get_terminal_mode() -> str: - try: - from IPython.core.getipython import get_ipython # type: ignore - - ipython = get_ipython() - if ipython is not None: - return ipython.__class__.__name__ - except: # noqa - pass - - if hasattr(sys, "ps1"): - return "shell" - else: - return "headless" - - -def _get_environment() -> str: - environments_dict = { - "GITHUB_ACTION": "github_action", - "GITLAB_CI": "gitlab_ci", - "BINDER_PORT": "binder", - "PYCHARM_HOSTED": "pycharm", - "SM_CURRENT_HOST": "sagemaker", - "DATABRICKS_RUNTIME_VERSION": "databricks", - "COLAB_GPU": "colab", - "KAGGLE_KERNEL_RUN_TYPE": "kaggle", - "DEEPNOTE_PROJECT_ID": "deepnote", - } - - for key, value in environments_dict.items(): - if key in os.environ: - return value - return "unknown" - - -def _has_lib(lib_name: str) -> bool: - try: - for p in _SITE_PACKAGES: - if os.path.exists(os.path.join(p, lib_name)): - return True - except: # noqa - pass - - return False diff --git a/python/whylogs/api/whylabs/session/session.py b/python/whylogs/api/whylabs/session/session.py index 42a2b5f884..2ef12bdec8 100644 --- a/python/whylogs/api/whylabs/session/session.py +++ b/python/whylogs/api/whylabs/session/session.py @@ -77,14 +77,12 @@ def __init__(self, config: SessionConfig) -> None: If neither exist then this will attempt to create a new session and store the id in the config, which does require a successful service call to whylabs. """ - from whylogs.api.usage_stats import emit_usage super().__init__(config) # Using lazy initialization to work around circular dependency issues self._whylabs_session_api = Lazy(self.__create_session_api) self._user_guid = self._get_or_create_user_guid() - emit_usage("guest_session") def __create_session_api(self) -> SessionsApi: from whylogs.api.whylabs.session.whylabs_client_cache import ClientCacheConfig @@ -255,15 +253,12 @@ def upload_batch_profile(self, profile: ResultSet) -> Union[UploadResult, NotSup class ApiKeySession(Session): def __init__(self, config: SessionConfig) -> None: - from whylogs.api.usage_stats import emit_usage - super().__init__(config) self.api_key = config.get_api_key() self.org_id = config.get_org_id() # Using lazy initialization to work around circular dependency issues self._whylabs_log_api = Lazy(partial(self.__create_log_api, config)) - emit_usage("api_key_session") def __create_log_api(self, config: SessionConfig) -> LogApi: from whylogs.api.whylabs.session.whylabs_client_cache import ClientCacheConfig diff --git a/python/whylogs/api/whylabs/session/whylabs_client_cache.py b/python/whylogs/api/whylabs/session/whylabs_client_cache.py index 8c37a3901c..f0081afc9f 100644 --- a/python/whylogs/api/whylabs/session/whylabs_client_cache.py +++ b/python/whylogs/api/whylabs/session/whylabs_client_cache.py @@ -22,10 +22,6 @@ def __call__(self, config: Configuration) -> None: def _validate_api_key(self, api_key: Optional[str]) -> str: if api_key is None: raise ValueError("Missing API key. Set it via WHYLABS_API_KEY environment variable or as an api_key option") - if len(api_key) < 12: - raise ValueError("API key too short") - if api_key[10] != ".": - raise ValueError("Invalid format. Expecting a dot at an index 10") return api_key[:10] diff --git a/python/whylogs/api/writer/mlflow.py b/python/whylogs/api/writer/mlflow.py index a46ec9879a..9bd7045343 100644 --- a/python/whylogs/api/writer/mlflow.py +++ b/python/whylogs/api/writer/mlflow.py @@ -5,7 +5,6 @@ import mlflow -from whylogs.api.usage_stats import emit_usage from whylogs.api.writer import Writer from whylogs.api.writer.writer import _Writable from whylogs.core.utils import deprecated_alias @@ -18,7 +17,6 @@ def __init__(self) -> None: self._file_dir = "whylogs" self._file_name = None self._end_run = True - emit_usage("mlflow_writer") @deprecated_alias(profile="file") def write( diff --git a/python/whylogs/api/writer/s3.py b/python/whylogs/api/writer/s3.py index 0fee67a155..e569f4a95b 100644 --- a/python/whylogs/api/writer/s3.py +++ b/python/whylogs/api/writer/s3.py @@ -6,7 +6,6 @@ from botocore.client import BaseClient from botocore.exceptions import ClientError -from whylogs.api.usage_stats import emit_usage from whylogs.api.writer import Writer from whylogs.api.writer.writer import _Writable from whylogs.core.utils import deprecated_alias @@ -63,7 +62,6 @@ def __init__( self.base_prefix = base_prefix or "profile" self.bucket_name = bucket_name or "" self.object_name = object_name or None - emit_usage("s3_writer") @deprecated_alias(profile="file") def write( diff --git a/python/whylogs/api/writer/whylabs_client.py b/python/whylogs/api/writer/whylabs_client.py index d85baefedb..5d60ba081d 100644 --- a/python/whylogs/api/writer/whylabs_client.py +++ b/python/whylogs/api/writer/whylabs_client.py @@ -100,7 +100,9 @@ def _giveup(e) -> bool: - return (e.status not in _RETRY_CODES,) # type: ignore + result = e.status not in _RETRY_CODES + logger.warning(f"whylabs client communication error: {e}, giveup: {result}") + return result def _get_column_names(x: Union[DatasetProfile, DatasetProfileView, SegmentedDatasetProfileView, ResultSet]) -> Set[str]: diff --git a/python/whylogs/core/constraints/__init__.py b/python/whylogs/core/constraints/__init__.py index d74894ab5c..ef0e5c1a95 100644 --- a/python/whylogs/core/constraints/__init__.py +++ b/python/whylogs/core/constraints/__init__.py @@ -1,5 +1,3 @@ -from whylogs.api.usage_stats import emit_usage - from .metric_constraints import ( Constraints, ConstraintsBuilder, @@ -19,5 +17,3 @@ MetricsSelector, PrefixCondition, ] - -emit_usage("metric_constraints") diff --git a/python/whylogs/core/constraints/factories/__init__.py b/python/whylogs/core/constraints/factories/__init__.py index 2715a21747..100019b6b4 100644 --- a/python/whylogs/core/constraints/factories/__init__.py +++ b/python/whylogs/core/constraints/factories/__init__.py @@ -1,5 +1,3 @@ -from whylogs.api.usage_stats import emit_usage - from .cardinality_metrics import distinct_number_in_range from .condition_counts import ( condition_count_below, @@ -65,5 +63,3 @@ condition_count_below, column_is_probably_unique, ] - -emit_usage("constraints_factories") diff --git a/python/whylogs/core/segmentation_partition.py b/python/whylogs/core/segmentation_partition.py index ae087a6dbd..d9a67c5a4e 100644 --- a/python/whylogs/core/segmentation_partition.py +++ b/python/whylogs/core/segmentation_partition.py @@ -4,7 +4,6 @@ from dataclasses import dataclass, field from typing import Callable, List, Mapping, Optional -from whylogs.api.usage_stats import emit_usage from whylogs.core.projectors import FieldProjector logger = logging.getLogger(__name__) @@ -81,6 +80,4 @@ def __hash__(self): def segment_on_column(column_name: str) -> Mapping[str, SegmentationPartition]: - emit_usage("segment_on_column") - return {column_name: SegmentationPartition(name=column_name, mapper=ColumnMapperFunction(col_names=[column_name]))} diff --git a/python/whylogs/core/validators/__init__.py b/python/whylogs/core/validators/__init__.py index 016bd5b83a..c5dd7698d5 100644 --- a/python/whylogs/core/validators/__init__.py +++ b/python/whylogs/core/validators/__init__.py @@ -1,4 +1,3 @@ -from whylogs.api.usage_stats import emit_usage from whylogs.core.validators.condition_validator import ConditionValidator from whylogs.core.validators.validator import Validator @@ -7,5 +6,3 @@ ConditionValidator, Validator, ] - -emit_usage("condition_validators") diff --git a/python/whylogs/core/validators/condition_validator.py b/python/whylogs/core/validators/condition_validator.py index 3eb5a38ddc..1490dce515 100644 --- a/python/whylogs/core/validators/condition_validator.py +++ b/python/whylogs/core/validators/condition_validator.py @@ -35,9 +35,6 @@ class ConditionValidator(Validator): sample_size: int = 10 def __post_init__(self): - from whylogs.api.usage_stats import emit_usage - - emit_usage("condition_validator") for cond_name in self.conditions.keys(): if cond_name not in self.failures: self.failures[cond_name] = 0 diff --git a/python/whylogs/viz/notebook_profile_viz.py b/python/whylogs/viz/notebook_profile_viz.py index 82ef1e60b0..e57936e912 100644 --- a/python/whylogs/viz/notebook_profile_viz.py +++ b/python/whylogs/viz/notebook_profile_viz.py @@ -7,7 +7,6 @@ from IPython.core.display import HTML # type: ignore import whylogs.viz.drift.column_drift_algorithms as column_drift_algorithms -from whylogs.api.usage_stats import emit_usage from whylogs.core.configs import SummaryConfig from whylogs.core.constraints import Constraints from whylogs.core.view.dataset_profile_view import DatasetProfileView @@ -24,7 +23,6 @@ ) logger = logging.getLogger(__name__) -emit_usage("visualizer") class NotebookProfileVisualizer: