From 5d1efae94cc8dffca527712d87c2c98909ad8a5a Mon Sep 17 00:00:00 2001
From: Frederic Lepied
Date: Tue, 12 Sep 2023 20:12:26 +0200
Subject: [PATCH] various fixes

- Save history parts under the `history` type to differentiate them from
  notes.
- Lower-case the metadata keys from the Markdown header to make them easier
  to use; this also fixes a bug between `url` and `Url`.
- Save the `Date` metadata from the Markdown files as `created_at` metadata
  using a datetime object.
- Generalize datetime management.
- Make reading and writing of json files more robust regarding date fields.
- Add a `Referer` header to the generated history files.
- Fix similarity.py to accept multiple filter parameters.
---
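
Illustration only (below the fold, dropped when the patch is applied): the
trailing key=value arguments of similarity.py are now turned by the new
split_filter() helper into a chromadb where clause. This is a minimal sketch,
assuming the script is importable from the repository root; the values are
made up:

    from similarity import split_filter

    split_filter([])                                     # -> {}
    split_filter(["type=notes"])                         # -> {"type": "notes"}
    split_filter(["type=history", "url=file:///x.md"])
    # -> {"$and": [{"type": "history"}, {"url": "file:///x.md"}]}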
<<< "$RES"; then exit 1 fi +# wait a bit to be sure to have all the logs in different seconds +# for the vacuum cleaning process to work +sleep 2 + # test changing a document but not its content sudo journalctl -u sba-md --rotate sudo journalctl -u sba-md --vacuum-time=1s diff --git a/lib.py b/lib.py index ae219e6..5586191 100644 --- a/lib.py +++ b/lib.py @@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder): "Encode datetime objects to json" def default(self, o): - "Encode datetime objects to json" + "Encode datetime objects to json as isoformat strings" if isinstance(o, datetime.datetime): return o.isoformat() return super().default(o) @@ -208,9 +208,12 @@ def default(self, o): def datetime_decoder(dct): "Decode datetime objects from json" for key, val in dct.items(): - if key == "last_accessed_at": + if key.endswith("_at") or key.find("date") != -1: try: - dct[key] = datetime.datetime.fromisoformat(val) + if isinstance(val, str): + dct[key] = datetime.datetime.fromisoformat(val) + elif isinstance(val, float): + dct[key] = datetime.datetime.fromtimestamp(val) except ValueError: pass # Not a valid datetime string, leave as is return dct diff --git a/similarity.py b/similarity.py index 5f6bb45..b9a67ea 100755 --- a/similarity.py +++ b/similarity.py @@ -11,6 +11,15 @@ from lib import get_vectorstore +def split_filter(args): + "Split the filter arguments into an and query for chromadb" + if len(args) == 0: + return {} + if len(args) == 1: + return dict([args[0].split("=", 1)]) + return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]} + + def main(query, **kwargs): "Entry point" vector_store = get_vectorstore() @@ -21,7 +30,6 @@ def main(query, **kwargs): if __name__ == "__main__": load_dotenv() - args = [arg.split("=", 1) for arg in sys.argv[2:]] - main(sys.argv[1], filter=dict(args)) + main(sys.argv[1], where=split_filter(sys.argv[2:])) # similarity.py ends here diff --git a/transform_md.py b/transform_md.py index 537d4ff..2763eda 100755 --- a/transform_md.py +++ b/transform_md.py @@ -43,6 +43,7 @@ def save_content(file_path, text, check_content=True, **metadata): "save the text and metatada into a json file" if check_content: try: + print(f"reading {file_path}", file=sys.stderr) with open(file_path, "r", encoding="utf-8") as in_f: data = json.load(in_f) if data["text"] == text: @@ -50,6 +51,8 @@ def save_content(file_path, text, check_content=True, **metadata): return False except FileNotFoundError: pass + except json.decoder.JSONDecodeError as exc: + print(f"invalid json file {file_path}: {exc}", file=sys.stderr) print(f"writing {file_path} metadata={metadata}", file=sys.stderr) data = {"text": text, "metadata": metadata} with open(file_path, "w", encoding="utf-8") as out_f: @@ -201,11 +204,21 @@ def get_metadata(content): for idx, line in enumerate(lines): header = line.split(":", 1) if len(header) == 2: - metadata[header[0].strip()] = header[1].strip() + metadata[header[0].strip().lower()] = header[1].strip() continue if line in ("---", "", "..."): continue break + if "date" in metadata: + # transform date to a date object and save is as created_at + # because langchain uses that field + try: + metadata["created_at"] = datetime.datetime.strptime( + metadata["date"], "%Y/%m/%d %H:%M" + ) + del metadata["date"] + except ValueError: + pass content = "\n".join(lines[idx:]) return metadata, content @@ -220,6 +233,18 @@ def remove_dash(content, level): return "\n".join(lines) +def get_date(date_str): + "Get the date from a string trying different formats: 01 Jan 

diff --git a/README.md b/README.md
index 4e8dd89..a8e6ecd 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ The system takes as input a directory where you store your markdown notes. For e
 
 ```mermaid
 graph TD
-A[Markdown files from Obsidian]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
+A[Markdown files from your editor]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
 ```
 
 From a markdown file, [transform_md.py](transform_md.py) extracts the text from the markdown file, then from the links inside the markdown file it extracts pdf, url, youtube video and transforms them into text. There is some support to extract history data from the markdown files: if there is an `## History` section or the file name contains `History`, the file is split in multiple parts according to ` ` sections like `### 10 Sep 2023`.
diff --git a/integration-test.sh b/integration-test.sh
index 604ae50..8bf162d 100755
--- a/integration-test.sh
+++ b/integration-test.sh
@@ -50,6 +50,10 @@ if grep -q "I don't know." <<< "$RES"; then
     exit 1
 fi
 
+# wait a bit to be sure all the logs land in different seconds
+# so that the vacuum cleaning process works
+sleep 2
+
 # test changing a document but not its content
 sudo journalctl -u sba-md --rotate
 sudo journalctl -u sba-md --vacuum-time=1s
diff --git a/lib.py b/lib.py
index ae219e6..5586191 100644
--- a/lib.py
+++ b/lib.py
@@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder):
     "Encode datetime objects to json"
 
     def default(self, o):
-        "Encode datetime objects to json"
+        "Encode datetime objects to json as isoformat strings"
         if isinstance(o, datetime.datetime):
             return o.isoformat()
         return super().default(o)
@@ -208,9 +208,12 @@ def default(self, o):
 def datetime_decoder(dct):
     "Decode datetime objects from json"
     for key, val in dct.items():
-        if key == "last_accessed_at":
+        if key.endswith("_at") or key.find("date") != -1:
             try:
-                dct[key] = datetime.datetime.fromisoformat(val)
+                if isinstance(val, str):
+                    dct[key] = datetime.datetime.fromisoformat(val)
+                elif isinstance(val, float):
+                    dct[key] = datetime.datetime.fromtimestamp(val)
             except ValueError:
                 pass  # Not a valid datetime string, leave as is
     return dct
diff --git a/similarity.py b/similarity.py
index 5f6bb45..b9a67ea 100755
--- a/similarity.py
+++ b/similarity.py
@@ -11,6 +11,15 @@
 from lib import get_vectorstore
 
 
+def split_filter(args):
+    "Split the filter arguments into an and query for chromadb"
+    if len(args) == 0:
+        return {}
+    if len(args) == 1:
+        return dict([args[0].split("=", 1)])
+    return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]}
+
+
 def main(query, **kwargs):
     "Entry point"
     vector_store = get_vectorstore()
@@ -21,7 +30,6 @@ def main(query, **kwargs):
 
 if __name__ == "__main__":
     load_dotenv()
-    args = [arg.split("=", 1) for arg in sys.argv[2:]]
-    main(sys.argv[1], filter=dict(args))
+    main(sys.argv[1], where=split_filter(sys.argv[2:]))
 
 # similarity.py ends here
diff --git a/transform_md.py b/transform_md.py
index 537d4ff..2763eda 100755
--- a/transform_md.py
+++ b/transform_md.py
@@ -43,6 +43,7 @@ def save_content(file_path, text, check_content=True, **metadata):
     "save the text and metatada into a json file"
     if check_content:
         try:
+            print(f"reading {file_path}", file=sys.stderr)
             with open(file_path, "r", encoding="utf-8") as in_f:
                 data = json.load(in_f)
                 if data["text"] == text:
@@ -50,6 +51,8 @@
                     return False
         except FileNotFoundError:
             pass
+        except json.decoder.JSONDecodeError as exc:
+            print(f"invalid json file {file_path}: {exc}", file=sys.stderr)
     print(f"writing {file_path} metadata={metadata}", file=sys.stderr)
     data = {"text": text, "metadata": metadata}
     with open(file_path, "w", encoding="utf-8") as out_f:
@@ -201,11 +204,21 @@ def get_metadata(content):
     for idx, line in enumerate(lines):
         header = line.split(":", 1)
         if len(header) == 2:
-            metadata[header[0].strip()] = header[1].strip()
+            metadata[header[0].strip().lower()] = header[1].strip()
             continue
        if line in ("---", "", "..."):
             continue
         break
 
+    if "date" in metadata:
+        # transform the date string into a datetime object and save it as created_at
+        # because langchain uses that field
+        try:
+            metadata["created_at"] = datetime.datetime.strptime(
+                metadata["date"], "%Y/%m/%d %H:%M"
+            )
+            del metadata["date"]
+        except ValueError:
+            pass
     content = "\n".join(lines[idx:])
     return metadata, content
@@ -220,6 +233,18 @@ def remove_dash(content, level):
     return "\n".join(lines)
 
 
+def get_date(date_str):
+    "Get the date from a string trying different formats: 01 January 2020 then 01 Jan 2020"
+    try:
+        return datetime.datetime.strptime(date_str, "%d %B %Y")
+    except ValueError:
+        try:
+            return datetime.datetime.strptime(date_str, "%d %b %Y")
+        except ValueError:
+            print(f"Unable to parse date {date_str}", file=sys.stderr)
+    return date_str
+
+
 DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 
@@ -247,13 +272,16 @@ def split_md_file(fname, md_dir):
     stat = os.stat(fname)
     os.utime(base_fname, (stat.st_atime, stat.st_mtime))
     for idx in range(1, len(history), 2):
-        history_date = datetime.datetime.strptime(history[idx], "%d %b %Y")
+        history_date = get_date(history[idx])
+        if isinstance(history_date, str):
+            continue
         if level == 1:
             date = history_date.strftime("%d")
         else:
             date = history_date.strftime("%Y%m%d")
         part_fname = os.path.join(md_dir, basename + date + ".md")
         with open(part_fname, "w", encoding="UTF-8") as fptr:
+            fptr.write(f"---\nReferer: {basename}\n---\n\n")
             fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
         mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
         os.utime(part_fname, (mtime, mtime))
@@ -274,8 +302,10 @@ def write_output_file(md_file, out_dir, metadata):
         metadata, content = get_metadata(output.page_content)
         metadata["type"] = "notes"
     else:
-        content = output.page_content
-        metadata["last_accessed_at"] = (last_accessed_at,)
+        new_metadata, content = get_metadata(output.page_content)
+        metadata.update(new_metadata)
+        metadata["type"] = "history"
+        metadata["last_accessed_at"] = last_accessed_at
     if "url" not in metadata:
         metadata["url"] = f"file://{md_file}"
     print(f"saving {md_file=} with {metadata=}", file=sys.stderr)
@@ -307,7 +337,6 @@ def process_md_file(fname, out_dir, checksum_store):
         return False
     basename = os.path.basename(fname[:-3])
     oname = get_output_file_path(out_dir, basename)
-    stat = os.stat(fname)
     if is_same_time(fname, oname):
         print(f"skipping {fname} as there is no time change", file=sys.stderr)
         return False
@@ -321,8 +350,6 @@ def process_md_file(fname, out_dir, checksum_store):
         print(f"skipping {fname} as content did not change", file=sys.stderr)
         return False
     print(f"processed '{fname}'", file=sys.stderr)
-    # set the timestamp to be the same
-    os.utime(oname, (stat.st_atime, stat.st_mtime))
     return True
 
 
diff --git a/transform_txt.py b/transform_txt.py
index f2e23c5..ba42e5f 100755
--- a/transform_txt.py
+++ b/transform_txt.py
@@ -5,6 +5,7 @@
 into vector embeddings and store the vectors in a vector database.
 """
 
+import datetime
 import json
 import os
 import sys
@@ -54,12 +55,21 @@ def validate_and_extract_url(fname, basename, out_dir):
     if is_same_time(fname, oname):
         return False, None
     with open(fname, encoding="utf-8") as in_stream:
-        data = json.load(in_stream, object_hook=datetime_decoder)
+        try:
+            data = json.load(in_stream, object_hook=datetime_decoder)
+        except json.JSONDecodeError as exc:
+            print(f"Could not parse {fname}: {exc}", file=sys.stderr)
+            return False, None
+    if "metadata" not in data:
+        print(f"Could not find metadata in {fname}", file=sys.stderr)
+        return False, None
 
     metadata = data["metadata"]
     # convert the datetime to timestamp because chromadb does not
     # support datetime
-    if "last_accessed_at" in metadata:
-        metadata["last_accessed_at"] = metadata["last_accessed_at"].timestamp()
+    for key, val in metadata.items():
+        # check if the value is a datetime
+        if isinstance(val, datetime.datetime):
+            metadata[key] = val.timestamp()
 
     return metadata, data["text"]