# --- ai_commons/file_processing/excel_parser.py ---
# Excel parser module
import pandas as pd
import json
import os


class ExcelConverter:
    """Load an Excel workbook into a DataFrame and export its rows as JSON.

    Attributes are plain instance fields:
      file_path   -- the path handed to the constructor.
      data_frame  -- the DataFrame returned by ``pd.read_excel``.
    """

    # Format used when rendering pandas timestamps as strings.
    _TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, file_path):
        """Read ``file_path`` with pandas, using the openpyxl engine.

        Args:
            file_path: Location of the workbook to load.
        """
        self.file_path = file_path
        self.data_frame = pd.read_excel(file_path, engine='openpyxl')

    @classmethod
    def _to_jsonable(cls, value):
        """Return ``value`` in a form ``json.dumps`` can emit as valid JSON."""
        if isinstance(value, pd.Timestamp):
            return value.strftime(cls._TIMESTAMP_FORMAT)
        # NaT is rejected by json.dumps and NaN is emitted as the
        # non-standard ``NaN`` token; both mean "no value", so emit null.
        # The is_scalar guard keeps pd.isna from returning an array for
        # list-like cell values.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def to_json(self, chunk_size=5):
        """Serialize the rows as a list of JSON array strings.

        Each row becomes a JSON object keyed by column name. Rows are
        grouped, in order, into arrays of at most ``chunk_size`` objects
        and every array is dumped to its own string.

        Args:
            chunk_size: Maximum number of rows per JSON string; must be
                at least 1.

        Returns:
            A list of JSON strings (empty when the frame has no rows).

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            # A negative step would silently drop every row.
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

        records = [
            {
                key: self._to_jsonable(value)
                for key, value in row.to_dict().items()
            }
            for _, row in self.data_frame.iterrows()
        ]

        return [
            json.dumps(records[start:start + chunk_size], ensure_ascii=False)
            for start in range(0, len(records), chunk_size)
        ]

    # TODO: Implement to_yaml method
    # TODO: Implement to_xml method


# --- ai_commons/file_processing/json_validator.py ---
# Json validator module


def remove_keys_from_json(json_string, keys_to_remove):
    """Strip the given keys from every object in a JSON document.

    Args:
        json_string: JSON text holding a list of objects (a single
            top-level object is accepted as well).
        keys_to_remove: Key names to delete. May be a flat iterable of
            strings or an iterable of iterables of strings, e.g.
            ``[["Timestamp"], ["User ID"]]``.

    Returns:
        The document re-serialized as a JSON string with the keys
        removed; non-ASCII characters are left unescaped.

    Raises:
        json.JSONDecodeError: If ``json_string`` is not valid JSON.
    """
    # Flatten once up front. A bare string is a single key, not a
    # sequence of one-character keys.
    doomed = set()
    for entry in keys_to_remove:
        if isinstance(entry, str):
            doomed.add(entry)
        else:
            doomed.update(entry)

    # Parse the JSON string.
    data = json.loads(json_string)
    records = data if isinstance(data, list) else [data]

    # Only objects carry keys; other items are passed through untouched.
    for record in records:
        if isinstance(record, dict):
            for key in doomed.intersection(record):
                del record[key]

    # Convert the structure back to a JSON string.
    return json.dumps(data, ensure_ascii=False)
# --- tests/test_file_processing.py ---
import unittest
import json
import os
import tempfile
import pandas as pd
from ai_commons.file_processing.excel_parser import ExcelConverter
from ai_commons.file_processing.json_validator import remove_keys_from_json


class TestExcelConverter(unittest.TestCase):
    """Exercise ExcelConverter against a small generated workbook."""

    def setUp(self):
        # Build the sample workbook inside a private temporary directory
        # so the test never overwrites a file in the working directory and
        # concurrent runs cannot collide. addCleanup removes the directory
        # even when setUp itself fails part-way.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp_dir.cleanup)
        self.test_file = os.path.join(self._tmp_dir.name, 'test.xlsx')
        df = pd.DataFrame({
            'Timestamp': [
                pd.Timestamp('2024-04-07 20:50:30'),
                pd.Timestamp('2024-05-08 15:30:45'),
            ],
            'User ID': ['U12345', 'U67890'],
            'Gift': ['Flower', 'Chocolate'],
        })
        df.to_excel(self.test_file, index=False)

    def test_to_json(self):
        """Both rows fit in one chunk and timestamps become strings."""
        converter = ExcelConverter(self.test_file)
        json_output = converter.to_json(chunk_size=2)
        expected_output = [
            '[{"Timestamp": "2024-04-07 20:50:30", "User ID": "U12345", "Gift": "Flower"}, {"Timestamp": "2024-05-08 15:30:45", "User ID": "U67890", "Gift": "Chocolate"}]'
        ]
        self.assertEqual(json_output, expected_output)

    def test_to_json_splits_into_chunks(self):
        """A chunk size of one yields one JSON string per row, in order."""
        converter = ExcelConverter(self.test_file)
        json_output = converter.to_json(chunk_size=1)
        expected_output = [
            '[{"Timestamp": "2024-04-07 20:50:30", "User ID": "U12345", "Gift": "Flower"}]',
            '[{"Timestamp": "2024-05-08 15:30:45", "User ID": "U67890", "Gift": "Chocolate"}]',
        ]
        self.assertEqual(json_output, expected_output)


class TestJsonValidator(unittest.TestCase):
    """Exercise remove_keys_from_json on a two-record document."""

    def test_remove_keys_from_json(self):
        """Keys listed in the nested lists are dropped from every record."""
        json_string = '[{"Timestamp": "2024-04-07 20:50:30", "User ID": "U12345", "Gift": "Flower"}, {"Timestamp": "2024-05-08 15:30:45", "User ID": "U67890", "Gift": "Chocolate"}]'
        keys_to_remove = [["Timestamp"], ["User ID"]]
        modified_json_string = remove_keys_from_json(json_string, keys_to_remove)
        expected_output = '[{"Gift": "Flower"}, {"Gift": "Chocolate"}]'
        self.assertEqual(modified_json_string, expected_output)


if __name__ == '__main__':
    unittest.main()