diff --git a/column2Vec/research/column2Vec_re.py b/column2Vec/research/column2Vec_re.py index 88224a4..5ecf8ea 100644 --- a/column2Vec/research/column2Vec_re.py +++ b/column2Vec/research/column2Vec_re.py @@ -48,8 +48,7 @@ ] MODEL = "paraphrase-multilingual-mpnet-base-v2" # 'bert-base-nli-mean-tokens' THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -model = SentenceTransformer(MODEL, tokenizer_kwargs={ - 'clean_up_tokenization_spaces': True}) +model = SentenceTransformer(MODEL, tokenizer_kwargs={"clean_up_tokenization_spaces": True}) def count_embedding(column1: pd.Series, function, key: str) -> pd.Series: diff --git a/constants.py b/constants.py index 0bb6719..10dc328 100644 --- a/constants.py +++ b/constants.py @@ -51,9 +51,12 @@ class TrainedModel: """ configure() - __model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", tokenizer_kwargs={ - 'clean_up_tokenization_spaces': True, - }) + __model = SentenceTransformer( + "paraphrase-multilingual-mpnet-base-v2", + tokenizer_kwargs={ + "clean_up_tokenization_spaces": True, + }, + ) def set_module(self, model: SentenceTransformer): """ diff --git a/similarity/DataFrameMetadataCreator.py b/similarity/DataFrameMetadataCreator.py index 504fc6e..cc9d1d4 100644 --- a/similarity/DataFrameMetadataCreator.py +++ b/similarity/DataFrameMetadataCreator.py @@ -63,8 +63,7 @@ def __init__(self, dataframe: pd.DataFrame): """ self.dataframe = dataframe self.metadata = DataFrameMetadata() - self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={ - 'clean_up_tokenization_spaces': True}) + self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True}) self.metadata.size = dataframe.shape[0] self.metadata.column_names = list(dataframe.columns) self.metadata.column_names_clean = {i: re.sub("[^(0-9 |a-z).]", " ", i.lower()) for i in self.metadata.column_names} @@ -153,8 +152,7 @@ def __get_model(self) -> SentenceTransformer: :return: embedding model if exists or creates new one """ if not self.model: - self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={ - 'clean_up_tokenization_spaces': True}) + self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True}) return self.model # Setting Creator diff --git a/similarityRunner/UI/run_similarity.py b/similarityRunner/UI/run_similarity.py index 7b55040..ddd0c13 100644 --- a/similarityRunner/UI/run_similarity.py +++ b/similarityRunner/UI/run_similarity.py @@ -5,6 +5,7 @@ from models.user_models import SimilaritySettings, MetadataSettings, ComparatorType as ct, RunType import runner as r + def get_arg(index, message): try: return sys.argv[index] @@ -12,14 +13,16 @@ def get_arg(index, message): print(message) sys.exit(1) + if __name__ == "__main__": - directory = get_arg(1,"Add path to directory") - run_type = get_arg(2,"Add run type, all metadata, similarity") # all, metadata, similarity - comparator_type = get_arg(3,"Add comparator type: by_column, by_type ") # by_column, by_type - settings = SimilaritySettings(connector=FSConnectorSettings(file_type=(ft.CSV, ft.PARQUET), files_paths=[], directory_paths=[directory]), - metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True), - run_type=RunType(run_type), - comparator_type=ct.BY_COLUMN if comparator_type == "by_column" else ct.BY_TYPE - ) + directory = get_arg(1, "Add path to directory") + run_type = get_arg(2, "Add run type, all metadata, similarity") # all, metadata, similarity + comparator_type = get_arg(3, "Add comparator type: by_column, by_type ") # by_column, by_type + settings = SimilaritySettings( + connector=FSConnectorSettings(file_type=(ft.CSV, ft.PARQUET), files_paths=[], directory_paths=[directory]), + metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True), + run_type=RunType(run_type), + comparator_type=ct.BY_COLUMN if comparator_type == "by_column" else ct.BY_TYPE, + ) result = r.run(settings) print(result) diff --git a/similarityRunner/connectors/filesystem_connector.py b/similarityRunner/connectors/filesystem_connector.py index c521e0d..648e4be 100644 --- a/similarityRunner/connectors/filesystem_connector.py +++ b/similarityRunner/connectors/filesystem_connector.py @@ -1,6 +1,7 @@ """ This file contains filesystem connector implementation """ + import os from functionsRunner import load_files_from_list diff --git a/similarityRunner/formators/jason_formater.py b/similarityRunner/formators/jason_formater.py index 60a9d40..e4a54c5 100644 --- a/similarityRunner/formators/jason_formater.py +++ b/similarityRunner/formators/jason_formater.py @@ -6,5 +6,5 @@ class JsonFormater(OutputFormaterInterface): def format(self, data: dict) -> json: - jsondata = json.dumps(data, indent = 4) + jsondata = json.dumps(data, indent=4) return jsondata diff --git a/similarityRunner/functionsRunner.py b/similarityRunner/functionsRunner.py index 911cc60..c6bc1a8 100644 --- a/similarityRunner/functionsRunner.py +++ b/similarityRunner/functionsRunner.py @@ -4,9 +4,7 @@ def load_files_from_list(folder: list[str], file_types: tuple = (FileType.CSV,)) -> tuple[list[pd.DataFrame], list[str]]: - """ - - """ + """ """ data = [] names = [] for file in folder: @@ -18,6 +16,7 @@ def load_files_from_list(folder: list[str], file_types: tuple = (FileType.CSV,)) names.append(file.replace(".parquet", "")) return data, names + def csv_to_parquet(file: str): df = pd.read_csv(file) df.to_parquet(file.replace(".csv", ".parquet")) diff --git a/similarityRunner/interfaces/ConnectorInterface.py b/similarityRunner/interfaces/ConnectorInterface.py index 906e0a5..a9aa0bf 100644 --- a/similarityRunner/interfaces/ConnectorInterface.py +++ b/similarityRunner/interfaces/ConnectorInterface.py @@ -35,4 +35,4 @@ def get_data(self, settings: ConnectorSettings) -> Output: @abc.abstractmethod def close(self): """Close the connection""" - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/similarityRunner/interfaces/OutputFormaterInterface.py b/similarityRunner/interfaces/OutputFormaterInterface.py index 3631ce4..344753d 100644 --- a/similarityRunner/interfaces/OutputFormaterInterface.py +++ b/similarityRunner/interfaces/OutputFormaterInterface.py @@ -1,8 +1,10 @@ """ """ + import abc + class OutputFormaterInterface(metaclass=abc.ABCMeta): """ OutputFormaterInterface class is an abstract interface that defines @@ -11,4 +13,4 @@ class OutputFormaterInterface(metaclass=abc.ABCMeta): @abc.abstractmethod def format(self, data: dict): - pass \ No newline at end of file + pass diff --git a/similarityRunner/models/connector_models.py b/similarityRunner/models/connector_models.py index 07560ae..50bc825 100644 --- a/similarityRunner/models/connector_models.py +++ b/similarityRunner/models/connector_models.py @@ -3,6 +3,7 @@ - the base class for connector settings and derived classes. - the base class for connector output and derived classes. """ + from enum import Enum import pandas as pd @@ -10,6 +11,7 @@ Output = tuple[list[pd.DataFrame], list[str]] + class FileType(str, Enum): CSV = "csv" PARQUET = "parquet" @@ -20,7 +22,8 @@ class ConnectorSettings(BaseModel): ConnectorSettings class is a base class for connector settings. """ - file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability + file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability + class Config: # arbitrary_types_allowed is set to True to allow tuple FileType arbitrary_types_allowed = True @@ -30,6 +33,7 @@ class ConnectorOutput(BaseModel): """ ConnectorOutput class is a base class for connector output. """ + names: list[str] tables: list[pd.DataFrame] @@ -42,11 +46,14 @@ class FSConnectorSettings(ConnectorSettings): """ FSConnectorSettings class is a derived class for filesystem connector settings. """ + files_paths: list[str] directory_paths: list[str] + class S3ConnectorSettings(ConnectorSettings): """ S3ConnectorSettings class is a derived class for S3 connector settings. """ + pass diff --git a/similarityRunner/models/user_models.py b/similarityRunner/models/user_models.py index f32db49..bc3c04f 100644 --- a/similarityRunner/models/user_models.py +++ b/similarityRunner/models/user_models.py @@ -1,6 +1,7 @@ """ This module contains the user models """ + from enum import Enum from pydantic import BaseModel @@ -23,28 +24,34 @@ class Config: # arbitrary_types_allowed is set to True to allow list and dictionary arbitrary_types_allowed = True + class MetadataSettings(BaseModel): """ MetadataSettings class is a base class for metadata settings. """ + all: bool kinds: bool types: bool embeddings: bool + class RunType(str, Enum): ALL = "all" METADATA = "metadata" SIMILARITY = "similarity" + class ComparatorType(Enum): BY_COLUMN = ComparatorByColumn() BY_TYPE = Comparator() + class SimilaritySettings(BaseModel): """ SimilaritySettings class is a base class for similarity settings. """ + connector: ConnectorSettings metadata: MetadataSettings run_type: RunType diff --git a/similarityRunner/runner.py b/similarityRunner/runner.py index c92d9ce..33c54f1 100644 --- a/similarityRunner/runner.py +++ b/similarityRunner/runner.py @@ -1,6 +1,7 @@ """ This """ + import time from Comparator import Comparator, KindComparator, ColumnExactNamesComparator as ExactNames @@ -22,13 +23,9 @@ def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, Dat df_metadata = {} if settings.metadata.all: for df, name in zip(dataframes, names): - df_metadata[name] = (DataFrameMetadataCreator(df) - .create_column_embeddings() - .compute_advanced_structural_types() - .compute_column_kind() - .get_metadata()) + df_metadata[name] = DataFrameMetadataCreator(df).create_column_embeddings().compute_advanced_structural_types().compute_column_kind().get_metadata() else: - ... # todo after #35 + ... # todo after #35 # todo save metadata after #35 return df_metadata @@ -43,21 +40,20 @@ def __get_comparator(settings: SimilaritySettings): return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator()) # todo add by settings #35 else: - comp = Comparator() # todo add by settings #35 + comp = Comparator() # todo add by settings #35 return comp.add_comparator_type(KindComparator()).add_comparator_type(ExactNames()) + def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]): """ Compute similarity between tables """ comparator = __get_comparator(settings) names = list(data.keys()) - similarity = { - name: {name2: comparator.compare(data[name], data[name2]) for name2 in names} - for name in names - } + similarity = {name: {name2: comparator.compare(data[name], data[name2]) for name2 in names} for name in names} return similarity + def run(settings: SimilaritySettings): """ Run the similarity pipeline @@ -78,4 +74,4 @@ def run(settings: SimilaritySettings): elif settings.run_type == "metadata": create_metadata(settings, data) elif settings.run_type == "similarity": - print("Similarity") # todo after #35 + print("Similarity") # todo after #35 diff --git a/test/test_runner_connectors.py b/test/test_runner_connectors.py index 47e8784..030f03d 100644 --- a/test/test_runner_connectors.py +++ b/test/test_runner_connectors.py @@ -1,6 +1,6 @@ import unittest -from similarityRunner.connectors.filesystem_connector import FilesystemConnector +from connectors.filesystem_connector import FilesystemConnector from similarityRunner.models.connector_models import FSConnectorSettings