Skip to content

Commit

Permalink
Merge pull request #3 from ling6614/feature/add-file-processing-tests
Browse files Browse the repository at this point in the history
Feature/add file processing tests
  • Loading branch information
cubxxw authored Jun 8, 2024
2 parents 81b6400 + c0d4047 commit ce5ec74
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 2 deletions.
38 changes: 38 additions & 0 deletions ai_commons/file_processing/excel_parser.py
Original file line number Diff line number Diff line change
@@ -1 +1,39 @@
# Excel parser module
import pandas as pd
import json
import os

class ExcelConverter:
    """Load an Excel workbook into a DataFrame and serialise its rows as JSON.

    Attributes set by ``__init__``:
        file_path: The path that was passed to the constructor.
        data_frame: The result of ``pd.read_excel(file_path, engine='openpyxl')``.
    """

    def __init__(self, file_path):
        """Read ``file_path`` with the openpyxl engine and keep the DataFrame."""
        self.file_path = file_path
        self.data_frame = pd.read_excel(file_path, engine='openpyxl')

    def to_json(self, chunk_size=5):
        """Return the rows of the sheet as a list of JSON array strings.

        Each row becomes a JSON object mapping column name to cell value.
        ``pd.Timestamp`` cells are rendered as ``'%Y-%m-%d %H:%M:%S'`` strings
        and missing cells (``NaN``, ``NaT``, ``None``) are rendered as ``null``
        so that every returned string is valid JSON.

        Args:
            chunk_size: Maximum number of rows per returned string; must be
                at least 1.

        Returns:
            A list of strings, each one a JSON array holding up to
            ``chunk_size`` row objects, in sheet order. Empty when the
            DataFrame has no rows.

        Raises:
            ValueError: If ``chunk_size`` is less than 1.
        """
        if chunk_size < 1:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        records = [
            {
                column: self._to_json_value(value)
                for column, value in row.to_dict().items()
            }
            for _, row in self.data_frame.iterrows()
        ]

        return [
            json.dumps(records[start:start + chunk_size], ensure_ascii=False)
            for start in range(0, len(records), chunk_size)
        ]

    @staticmethod
    def _to_json_value(value):
        """Convert one cell value into something ``json.dumps`` can encode."""
        # Missing values are checked first: NaT cannot be encoded by
        # json.dumps at all, and NaN would be emitted as the non-standard
        # token ``NaN``.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        if isinstance(value, pd.Timestamp):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        return value

# TODO: Implement to_yaml method
# def to_yaml(self):
# data = self.data_frame.to_dict(orient='records')
# return yaml.dump(data, allow_unicode=True)

# TODO: Implement to_xml method
# def to_xml(self):
# root = ET.Element("root")
# for _, row in self.data_frame.iterrows():
# item = ET.SubElement(root, "item")
# for key, value in row.items():
# if isinstance(value, pd.Timestamp):
# value = value.strftime('%Y-%m-%d %H:%M:%S')
# child = ET.SubElement(item, key)
# child.text = str(value)
# return ET.tostring(root, encoding='unicode')
18 changes: 17 additions & 1 deletion ai_commons/file_processing/json_validator.py
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
# JSON validator module
# Json validator module

import json

def remove_keys_from_json(json_string, keys_to_remove):
    """Strip the given keys from every object in a JSON document.

    Args:
        json_string: JSON text, normally an array of objects. A single
            top-level object is also accepted and treated as one record.
        keys_to_remove: Iterable of key groups, e.g.
            ``[["Timestamp"], ["User ID"]]``. A bare string entry is treated
            as one key rather than being split into characters, so a flat
            list such as ``["Timestamp", "User ID"]`` works as well.

    Returns:
        The document re-serialised with ``ensure_ascii=False`` after the
        listed keys have been deleted from each object. Array items that are
        not objects are left untouched.

    Raises:
        json.JSONDecodeError: If ``json_string`` is not valid JSON.
    """
    # Parse the JSON text first so malformed input fails before anything else.
    payload = json.loads(json_string)

    # Flatten the key groups once instead of re-walking them for every record.
    doomed = set()
    for group in keys_to_remove:
        if isinstance(group, str):
            doomed.add(group)
        else:
            doomed.update(group)

    records = [payload] if isinstance(payload, dict) else payload
    for record in records:
        # Only JSON objects have keys to delete; skip numbers, strings, etc.
        if not isinstance(record, dict):
            continue
        for key in doomed:
            record.pop(key, None)

    # Serialise the (mutated) document back to a JSON string.
    return json.dumps(payload, ensure_ascii=False)
44 changes: 43 additions & 1 deletion tests/test_file_processing.py
Original file line number Diff line number Diff line change
@@ -1 +1,43 @@
# Test file processing
import json
import os
import tempfile
import unittest

import pandas as pd

from ai_commons.file_processing.excel_parser import ExcelConverter
from ai_commons.file_processing.json_validator import remove_keys_from_json

class TestExcelConverter(unittest.TestCase):
    """Round-trip check for ``ExcelConverter.to_json`` on a generated workbook."""

    def setUp(self):
        # Create the sample workbook inside a private temporary directory so
        # the test never overwrites (or later deletes) a real ``test.xlsx`` in
        # the current working directory and cannot collide with parallel runs.
        workdir = tempfile.TemporaryDirectory()
        self.addCleanup(workdir.cleanup)
        self.test_file = os.path.join(workdir.name, 'test.xlsx')

        df = pd.DataFrame({
            'Timestamp': [
                pd.Timestamp('2024-04-07 20:50:30'),
                pd.Timestamp('2024-05-08 15:30:45'),
            ],
            'User ID': ['U12345', 'U67890'],
            'Gift': ['Flower', 'Chocolate'],
        })
        df.to_excel(self.test_file, index=False)

    def test_to_json(self):
        """Both rows fit in one chunk and timestamps are rendered as strings."""
        converter = ExcelConverter(self.test_file)
        json_output = converter.to_json(chunk_size=2)
        expected_output = [
            '[{"Timestamp": "2024-04-07 20:50:30", "User ID": "U12345", "Gift": "Flower"}, {"Timestamp": "2024-05-08 15:30:45", "User ID": "U67890", "Gift": "Chocolate"}]'
        ]
        self.assertEqual(json_output, expected_output)

class TestJsonValidator(unittest.TestCase):
    """Checks that ``remove_keys_from_json`` drops the requested keys."""

    def test_remove_keys_from_json(self):
        """Removing two key groups leaves only the ``Gift`` field per record."""
        source = '[{"Timestamp": "2024-04-07 20:50:30", "User ID": "U12345", "Gift": "Flower"}, {"Timestamp": "2024-05-08 15:30:45", "User ID": "U67890", "Gift": "Chocolate"}]'
        stripped = remove_keys_from_json(source, [["Timestamp"], ["User ID"]])
        self.assertEqual(
            stripped,
            '[{"Gift": "Flower"}, {"Gift": "Chocolate"}]',
        )

# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit ce5ec74

Please sign in to comment.