Skip to content

Commit b271786

Browse files
#23 add file system connector
1 parent b88e01d commit b271786

File tree

5 files changed

+70
-1
lines changed

5 files changed

+70
-1
lines changed

similarityRunner/connectors/__init__.py

Whitespace-only changes.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""
2+
This file contains the filesystem connector implementation.
3+
"""
4+
import os
5+
6+
from functionsRunner import load_files_from_list
7+
from interfaces.ConnectorInterface import ConnectorInterface
8+
from models.connector_models import ConnectorSettings, Output, ConnectorOutput, FSConnectorSettings
9+
10+
11+
class FilesystemConnector(ConnectorInterface):
    """
    Connector that loads tables from files on the local filesystem.

    The files to load are described by an ``FSConnectorSettings`` instance:
    explicit file paths plus directories whose entries are listed and added.
    """

    def __init__(self, config):
        """
        Store the connector configuration.

        :param config: configuration object; it is only stored on the instance
        """
        self.config = config

    def _connect_and_load_data_source(self, settings: FSConnectorSettings) -> ConnectorOutput:
        """
        Collect all candidate file paths and load them.

        :param settings: provides ``files_paths``, ``directory_paths`` and ``file_type``
        :return: ConnectorOutput holding the loaded tables and their names
        """
        # Copy so that the list stored on the settings object is never mutated.
        file_list = list(settings.files_paths)
        for folder in settings.directory_paths:
            # os.listdir returns bare entry names; join them with the folder
            # so the loader receives usable paths.
            file_list.extend(os.path.join(folder, entry) for entry in os.listdir(folder))

        # load_files_from_list expects the paths themselves and returns
        # (dataframes, names) in that order.
        tables, names = load_files_from_list(file_list, settings.file_type)
        return ConnectorOutput(names=names, tables=tables)

    def _format_data(self, data: ConnectorOutput) -> Output:
        """
        Convert the loaded data to the common ``Output`` format.

        Not implemented yet: the method currently returns ``None``.
        """
        # TODO: implement the conversion from ConnectorOutput to Output.

    def close(self):
        """Close the connector. Currently a no-op."""
28+

similarityRunner/functionsRunner.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pandas as pd
2+
3+
from models.connector_models import FileType
4+
5+
6+
def load_files_from_list(folder: list[str], file_type: tuple[FileType, ...] = (FileType.CSV,)) -> tuple[list[pd.DataFrame], list[str]]:
    """
    Load the CSV and/or Parquet files listed in ``folder``.

    A path is loaded only when its extension matches one of the requested
    file types; all other paths are skipped.

    :param folder: list of file paths to load (despite the name, these are
        individual files, not a directory)
    :param file_type: file types to load; a single ``FileType`` member is
        also accepted and treated as a one-element tuple
    :return: two lists of equal length: the loaded dataframes and the
        matching names (each file path with its extension removed)
    """
    # Accept a bare enum member: `in` on a single member would raise TypeError.
    if isinstance(file_type, FileType):
        file_type = (file_type,)

    data = []
    names = []
    for file in folder:
        if FileType.CSV in file_type and file.endswith(".csv"):
            data.append(pd.read_csv(file))
            # removesuffix strips only the trailing extension, so a ".csv"
            # elsewhere in the path is left untouched.
            names.append(file.removesuffix(".csv"))
        elif FileType.PARQUET in file_type and file.endswith(".parquet"):
            data.append(pd.read_parquet(file))
            names.append(file.removesuffix(".parquet"))
    return data, names

similarityRunner/interfaces/ConnectorInterface.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,8 @@ def get_data(self, settings: ConnectorSettings) -> Output:
3131
:return: data"""
3232
data = self._connect_and_load_data_source(settings)
3333
return self._format_data(data)
34+
35+
@abc.abstractmethod
def close(self):
    """
    Close the connection.

    Abstract: the base implementation only raises NotImplementedError.
    """
    raise NotImplementedError()

similarityRunner/models/connector_models.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,38 @@
33
- the base class for connector settings and derived classes.
44
- the base class for connector output and derived classes.
55
"""
6+
from enum import Enum
67

78
import pandas as pd
89
from pydantic import BaseModel
910

1011
Output = pd.DataFrame
1112

13+
class FileType(Enum):
    """File formats a connector can be asked to load."""

    # The values are the lower-case format names.
    CSV = "csv"
    PARQUET = "parquet"
16+
1217

1318
class ConnectorSettings(BaseModel):
    """
    ConnectorSettings class is a base class for connector settings.
    """

    # here will be common fields for all connectors

    # File types to load (csv, parquet, etc.). A tuple is used for
    # immutability; the ``...`` allows any number of types, whereas
    # ``tuple[FileType]`` would mean exactly one element.
    file_type: tuple[FileType, ...]
1925

2026

2127
class ConnectorOutput(BaseModel):
    """
    ConnectorOutput class is a base class for connector output.
    """

    # Names of the loaded tables.
    names: list[str]
    # Loaded tables; presumably index-aligned with ``names`` (verify against
    # the connector that fills this model).
    tables: list[pd.DataFrame]
    # here will be common fields for all connectors

    class Config:
        # pydantic has no built-in validator for pandas DataFrames, so the
        # model cannot be created without allowing arbitrary types.
        # NOTE(review): class-based Config works on pydantic v1 and is still
        # accepted (with a deprecation warning) on v2; confirm the version.
        arbitrary_types_allowed = True
34+
35+
class FSConnectorSettings(ConnectorSettings):
    """
    Settings for the filesystem connector, derived from ConnectorSettings.
    """

    # Paths of individual files.
    files_paths: list[str]
    # Paths of directories.
    directory_paths: list[str]

0 commit comments

Comments
 (0)