Skip to content

Commit

Permalink
Format files with black
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivieFranklova committed Oct 2, 2024
1 parent 6ea0607 commit d087b50
Show file tree
Hide file tree
Showing 13 changed files with 52 additions and 37 deletions.
3 changes: 1 addition & 2 deletions column2Vec/research/column2Vec_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@
]
MODEL = "paraphrase-multilingual-mpnet-base-v2" # 'bert-base-nli-mean-tokens'
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Shared sentence-embedding model for this research module.
# NOTE(review): clean_up_tokenization_spaces is pinned explicitly — presumably to
# keep tokenizer output stable across transformers versions; confirm.
model = SentenceTransformer(MODEL, tokenizer_kwargs={"clean_up_tokenization_spaces": True})


def count_embedding(column1: pd.Series, function, key: str) -> pd.Series:
Expand Down
9 changes: 6 additions & 3 deletions constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,12 @@ class TrainedModel:
"""

configure()
__model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", tokenizer_kwargs={
'clean_up_tokenization_spaces': True,
})
__model = SentenceTransformer(
"paraphrase-multilingual-mpnet-base-v2",
tokenizer_kwargs={
"clean_up_tokenization_spaces": True,
},
)

def set_module(self, model: SentenceTransformer):
"""
Expand Down
6 changes: 2 additions & 4 deletions similarity/DataFrameMetadataCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def __init__(self, dataframe: pd.DataFrame):
"""
self.dataframe = dataframe
self.metadata = DataFrameMetadata()
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
'clean_up_tokenization_spaces': True})
self.model: Optional[SentenceTransformer] = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True})
self.metadata.size = dataframe.shape[0]
self.metadata.column_names = list(dataframe.columns)
self.metadata.column_names_clean = {i: re.sub("[^(0-9 |a-z).]", " ", i.lower()) for i in self.metadata.column_names}
Expand Down Expand Up @@ -153,8 +152,7 @@ def __get_model(self) -> SentenceTransformer:
:return: embedding model if exists or creates new one
"""
if not self.model:
self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={
'clean_up_tokenization_spaces': True})
self.model = SentenceTransformer("bert-base-nli-mean-tokens", tokenizer_kwargs={"clean_up_tokenization_spaces": True})
return self.model

# Setting Creator
Expand Down
19 changes: 11 additions & 8 deletions similarityRunner/UI/run_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,24 @@
from models.user_models import SimilaritySettings, MetadataSettings, ComparatorType as ct, RunType
import runner as r


def get_arg(index, message):
    """Return the command-line argument at position *index*.

    :param index: position in ``sys.argv`` to read
    :param message: usage hint printed when the argument is missing
    :return: the argument string
    :raises SystemExit: with status 1 when ``sys.argv`` has no such index
    """
    try:
        return sys.argv[index]
    except IndexError:
        # Usage errors belong on stderr so stdout stays clean for results.
        print(message, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    # Positional CLI arguments; get_arg exits with the usage hint when one is missing.
    directory = get_arg(1, "Add path to directory")
    run_type = get_arg(2, "Add run type, all metadata, similarity")  # all, metadata, similarity
    comparator_type = get_arg(3, "Add comparator type: by_column, by_type ")  # by_column, by_type
    settings = SimilaritySettings(
        connector=FSConnectorSettings(file_type=(ft.CSV, ft.PARQUET), files_paths=[], directory_paths=[directory]),
        metadata=MetadataSettings(all=True, kinds=True, types=True, embeddings=True),
        run_type=RunType(run_type),
        # Anything other than the exact string "by_column" falls back to BY_TYPE.
        comparator_type=ct.BY_COLUMN if comparator_type == "by_column" else ct.BY_TYPE,
    )
    result = r.run(settings)
    print(result)
1 change: 1 addition & 0 deletions similarityRunner/connectors/filesystem_connector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This file contains filesystem connector implementation
"""

import os

from functionsRunner import load_files_from_list
Expand Down
2 changes: 1 addition & 1 deletion similarityRunner/formators/jason_formater.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class JsonFormater(OutputFormaterInterface):
    """Format a result dictionary as a pretty-printed JSON string."""

    def format(self, data: dict) -> str:
        """Serialize *data* to JSON text with 4-space indentation.

        :param data: result mapping to serialize
        :return: the JSON string
        """
        # Return annotation fixed: json is a module, not a type; dumps returns str.
        return json.dumps(data, indent=4)
5 changes: 2 additions & 3 deletions similarityRunner/functionsRunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@


def load_files_from_list(folder: list[str], file_types: tuple = (FileType.CSV,)) -> tuple[list[pd.DataFrame], list[str]]:
"""
"""
""" """
data = []
names = []
for file in folder:
Expand All @@ -18,6 +16,7 @@ def load_files_from_list(folder: list[str], file_types: tuple = (FileType.CSV,))
names.append(file.replace(".parquet", ""))
return data, names


def csv_to_parquet(file: str):
    """Convert a CSV file to a Parquet file alongside it.

    The output path swaps only the final ``.csv`` suffix, so ``.csv``
    occurring elsewhere in the path is left untouched.

    :param file: path to the source ``.csv`` file
    """
    df = pd.read_csv(file)
    # str.removesuffix strips only a trailing ".csv", unlike str.replace,
    # which would also rewrite ".csv" appearing mid-path.
    df.to_parquet(file.removesuffix(".csv") + ".parquet")
Expand Down
2 changes: 1 addition & 1 deletion similarityRunner/interfaces/ConnectorInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ def get_data(self, settings: ConnectorSettings) -> Output:
@abc.abstractmethod
def close(self):
"""Close the connection"""
raise NotImplementedError
raise NotImplementedError
4 changes: 3 additions & 1 deletion similarityRunner/interfaces/OutputFormaterInterface.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""
"""

import abc


class OutputFormaterInterface(metaclass=abc.ABCMeta):
"""
OutputFormaterInterface class is an abstract interface that defines
Expand All @@ -11,4 +13,4 @@ class OutputFormaterInterface(metaclass=abc.ABCMeta):

@abc.abstractmethod
def format(self, data: dict):
pass
pass
9 changes: 8 additions & 1 deletion similarityRunner/models/connector_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
- the base class for connector settings and derived classes.
- the base class for connector output and derived classes.
"""

from enum import Enum

import pandas as pd
from pydantic import BaseModel

Output = tuple[list[pd.DataFrame], list[str]]


class FileType(str, Enum):
CSV = "csv"
PARQUET = "parquet"
Expand All @@ -20,7 +22,8 @@ class ConnectorSettings(BaseModel):
ConnectorSettings class is a base class for connector settings.
"""

file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability
file_type: tuple[FileType, ...] # csv, parquet, etc., tuple for immutability

class Config:
# arbitrary_types_allowed is set to True to allow tuple FileType
arbitrary_types_allowed = True
Expand All @@ -30,6 +33,7 @@ class ConnectorOutput(BaseModel):
"""
ConnectorOutput class is a base class for connector output.
"""

names: list[str]
tables: list[pd.DataFrame]

Expand All @@ -42,11 +46,14 @@ class FSConnectorSettings(ConnectorSettings):
"""
FSConnectorSettings class is a derived class for filesystem connector settings.
"""

files_paths: list[str]
directory_paths: list[str]


class S3ConnectorSettings(ConnectorSettings):
"""
S3ConnectorSettings class is a derived class for S3 connector settings.
"""

pass
7 changes: 7 additions & 0 deletions similarityRunner/models/user_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module contains the user models
"""

from enum import Enum

from pydantic import BaseModel
Expand All @@ -23,28 +24,34 @@ class Config:
# arbitrary_types_allowed is set to True to allow list and dictionary
arbitrary_types_allowed = True


class MetadataSettings(BaseModel):
    """
    MetadataSettings class is a base class for metadata settings.

    Boolean flags selecting which metadata the runner computes. When ``all``
    is true the runner computes embeddings, structural types and column kinds
    in one pass; the individual flags are presumably honored by the
    per-feature path — TODO confirm once that path is implemented
    (see "todo after #35" in runner.py).
    """

    all: bool  # compute every metadata feature (embeddings, types, kinds)
    kinds: bool  # column kinds — not yet consumed by the runner; confirm semantics
    types: bool  # structural column types — not yet consumed; confirm semantics
    embeddings: bool  # column embeddings — not yet consumed; confirm semantics


class RunType(str, Enum):
    """Pipeline stage to execute: full run, metadata only, or similarity only.

    str-valued so it compares equal to the raw CLI strings the runner checks.
    """

    ALL = "all"
    METADATA = "metadata"
    SIMILARITY = "similarity"


class ComparatorType(Enum):
    """Available comparator strategies.

    NOTE(review): each member's value is a comparator *instance* created at
    import time, so the instances are shared singletons — confirm the
    comparators are stateless before reusing them across runs.
    """

    BY_COLUMN = ComparatorByColumn()
    BY_TYPE = Comparator()


class SimilaritySettings(BaseModel):
"""
SimilaritySettings class is a base class for similarity settings.
"""

connector: ConnectorSettings
metadata: MetadataSettings
run_type: RunType
Expand Down
20 changes: 8 additions & 12 deletions similarityRunner/runner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This
"""

import time

from Comparator import Comparator, KindComparator, ColumnExactNamesComparator as ExactNames
Expand All @@ -22,13 +23,9 @@ def create_metadata(settings: SimilaritySettings, data: Output) -> dict[str, Dat
df_metadata = {}
if settings.metadata.all:
for df, name in zip(dataframes, names):
df_metadata[name] = (DataFrameMetadataCreator(df)
.create_column_embeddings()
.compute_advanced_structural_types()
.compute_column_kind()
.get_metadata())
df_metadata[name] = DataFrameMetadataCreator(df).create_column_embeddings().compute_advanced_structural_types().compute_column_kind().get_metadata()
else:
... # todo after #35
... # todo after #35

# todo save metadata after #35
return df_metadata
Expand All @@ -43,21 +40,20 @@ def __get_comparator(settings: SimilaritySettings):
return comp.add_comparator_type(ColumnKindComparator()).add_comparator_type(ColumnExactNamesComparator())
# todo add by settings #35
else:
comp = Comparator() # todo add by settings #35
comp = Comparator() # todo add by settings #35
return comp.add_comparator_type(KindComparator()).add_comparator_type(ExactNames())


def compute_similarity(settings: SimilaritySettings, data: dict[str, DataFrameMetadata]):
    """
    Compute similarity between tables.

    :param settings: similarity settings used to pick the comparator
    :param data: mapping of table name to its DataFrameMetadata
    :return: nested mapping name -> name -> pairwise comparison result
    """
    comparator = __get_comparator(settings)
    names = list(data.keys())
    # Full n x n matrix: includes self-comparisons and both (a, b) and (b, a).
    similarity = {name: {name2: comparator.compare(data[name], data[name2]) for name2 in names} for name in names}
    return similarity


def run(settings: SimilaritySettings):
"""
Run the similarity pipeline
Expand All @@ -78,4 +74,4 @@ def run(settings: SimilaritySettings):
elif settings.run_type == "metadata":
create_metadata(settings, data)
elif settings.run_type == "similarity":
print("Similarity") # todo after #35
print("Similarity") # todo after #35
2 changes: 1 addition & 1 deletion test/test_runner_connectors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from similarityRunner.connectors.filesystem_connector import FilesystemConnector
from connectors.filesystem_connector import FilesystemConnector
from similarityRunner.models.connector_models import FSConnectorSettings


Expand Down

0 comments on commit d087b50

Please sign in to comment.