From 5d1efae94cc8dffca527712d87c2c98909ad8a5a Mon Sep 17 00:00:00 2001
From: Frederic Lepied
Date: Tue, 12 Sep 2023 20:12:26 +0200
Subject: [PATCH] various fixes

- Save history parts under the `history` type to differentiate them from
  notes.
- Lower-case the metadata keys from the Markdown header to make them easier
  to use; this also fixes a bug between `url` and `Url`.
- Save the `Date` metadata from the Markdown files as `created_at` metadata
  using a datetime object.
- Generalize datetime management.
- Make reading and writing of json files more robust regarding date fields.
- Add a `Referer` header to the generated history files.
- Fix similarity.py to accept multiple filter parameters.
---
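
Illustration only (below the fold, dropped when the patch is applied): the
trailing key=value arguments of similarity.py are now turned by the new
split_filter() helper into a chromadb where clause. This is a minimal sketch,
assuming the script is importable from the repository root; the values are
made up:

    from similarity import split_filter

    split_filter([])                                     # -> {}
    split_filter(["type=notes"])                         # -> {"type": "notes"}
    split_filter(["type=history", "url=file:///x.md"])
    # -> {"$and": [{"type": "history"}, {"url": "file:///x.md"}]}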
<<< "$RES"; then exit 1 fi +# wait a bit to be sure to have all the logs in different seconds +# for the vacuum cleaning process to work +sleep 2 + # test changing a document but not its content sudo journalctl -u sba-md --rotate sudo journalctl -u sba-md --vacuum-time=1s diff --git a/lib.py b/lib.py index ae219e6..5586191 100644 --- a/lib.py +++ b/lib.py @@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder): "Encode datetime objects to json" def default(self, o): - "Encode datetime objects to json" + "Encode datetime objects to json as isoformat strings" if isinstance(o, datetime.datetime): return o.isoformat() return super().default(o) @@ -208,9 +208,12 @@ def default(self, o): def datetime_decoder(dct): "Decode datetime objects from json" for key, val in dct.items(): - if key == "last_accessed_at": + if key.endswith("_at") or key.find("date") != -1: try: - dct[key] = datetime.datetime.fromisoformat(val) + if isinstance(val, str): + dct[key] = datetime.datetime.fromisoformat(val) + elif isinstance(val, float): + dct[key] = datetime.datetime.fromtimestamp(val) except ValueError: pass # Not a valid datetime string, leave as is return dct diff --git a/similarity.py b/similarity.py index 5f6bb45..b9a67ea 100755 --- a/similarity.py +++ b/similarity.py @@ -11,6 +11,15 @@ from lib import get_vectorstore +def split_filter(args): + "Split the filter arguments into an and query for chromadb" + if len(args) == 0: + return {} + if len(args) == 1: + return dict([args[0].split("=", 1)]) + return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]} + + def main(query, **kwargs): "Entry point" vector_store = get_vectorstore() @@ -21,7 +30,6 @@ def main(query, **kwargs): if __name__ == "__main__": load_dotenv() - args = [arg.split("=", 1) for arg in sys.argv[2:]] - main(sys.argv[1], filter=dict(args)) + main(sys.argv[1], where=split_filter(sys.argv[2:])) # similarity.py ends here diff --git a/transform_md.py b/transform_md.py index 537d4ff..2763eda 100755 --- a/transform_md.py +++ b/transform_md.py @@ -43,6 +43,7 @@ def save_content(file_path, text, check_content=True, **metadata): "save the text and metatada into a json file" if check_content: try: + print(f"reading {file_path}", file=sys.stderr) with open(file_path, "r", encoding="utf-8") as in_f: data = json.load(in_f) if data["text"] == text: @@ -50,6 +51,8 @@ def save_content(file_path, text, check_content=True, **metadata): return False except FileNotFoundError: pass + except json.decoder.JSONDecodeError as exc: + print(f"invalid json file {file_path}: {exc}", file=sys.stderr) print(f"writing {file_path} metadata={metadata}", file=sys.stderr) data = {"text": text, "metadata": metadata} with open(file_path, "w", encoding="utf-8") as out_f: @@ -201,11 +204,21 @@ def get_metadata(content): for idx, line in enumerate(lines): header = line.split(":", 1) if len(header) == 2: - metadata[header[0].strip()] = header[1].strip() + metadata[header[0].strip().lower()] = header[1].strip() continue if line in ("---", "", "..."): continue break + if "date" in metadata: + # transform date to a date object and save is as created_at + # because langchain uses that field + try: + metadata["created_at"] = datetime.datetime.strptime( + metadata["date"], "%Y/%m/%d %H:%M" + ) + del metadata["date"] + except ValueError: + pass content = "\n".join(lines[idx:]) return metadata, content @@ -220,6 +233,18 @@ def remove_dash(content, level): return "\n".join(lines) +def get_date(date_str): + "Get the date from a string trying different formats: 01 Jan 

diff --git a/README.md b/README.md
index 4e8dd89..a8e6ecd 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ The system takes as input a directory where you store your markdown notes. For e
 
 ```mermaid
 graph TD
-A[Markdown files from Obsidian]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
+A[Markdown files from your editor]-->B[Text files from markdown and pointers]-->C[Text Chunks]-->D[Vector Database]-->E[Second Brain AI Agent]
 ```
 
 From a markdown file, [transform_md.py](transform_md.py) extracts the text from the markdown file, then from the links inside the markdown file it extracts pdf, url, youtube video and transforms them into text. There is some support to extract history data from the markdown files: if there is an `## History` section or the file name contains `History`, the file is split in multiple parts according to ` ` sections like `### 10 Sep 2023`.
diff --git a/integration-test.sh b/integration-test.sh
index 604ae50..8bf162d 100755
--- a/integration-test.sh
+++ b/integration-test.sh
@@ -50,6 +50,10 @@ if grep -q "I don't know." <<< "$RES"; then
     exit 1
 fi
 
+# wait a bit to be sure all the logs land in different seconds
+# so that the vacuum cleaning process works
+sleep 2
+
 # test changing a document but not its content
 sudo journalctl -u sba-md --rotate
 sudo journalctl -u sba-md --vacuum-time=1s
diff --git a/lib.py b/lib.py
index ae219e6..5586191 100644
--- a/lib.py
+++ b/lib.py
@@ -198,7 +198,7 @@ class DateTimeEncoder(json.JSONEncoder):
     "Encode datetime objects to json"
 
     def default(self, o):
-        "Encode datetime objects to json"
+        "Encode datetime objects to json as isoformat strings"
         if isinstance(o, datetime.datetime):
             return o.isoformat()
         return super().default(o)
@@ -208,9 +208,12 @@ def default(self, o):
 def datetime_decoder(dct):
     "Decode datetime objects from json"
     for key, val in dct.items():
-        if key == "last_accessed_at":
+        if key.endswith("_at") or key.find("date") != -1:
             try:
-                dct[key] = datetime.datetime.fromisoformat(val)
+                if isinstance(val, str):
+                    dct[key] = datetime.datetime.fromisoformat(val)
+                elif isinstance(val, float):
+                    dct[key] = datetime.datetime.fromtimestamp(val)
             except ValueError:
                 pass  # Not a valid datetime string, leave as is
     return dct
diff --git a/similarity.py b/similarity.py
index 5f6bb45..b9a67ea 100755
--- a/similarity.py
+++ b/similarity.py
@@ -11,6 +11,15 @@
 from lib import get_vectorstore
 
 
+def split_filter(args):
+    "Split the filter arguments into an and query for chromadb"
+    if len(args) == 0:
+        return {}
+    if len(args) == 1:
+        return dict([args[0].split("=", 1)])
+    return {"$and": [{arg.split("=", 1)[0]: arg.split("=", 1)[1]} for arg in args]}
+
+
 def main(query, **kwargs):
     "Entry point"
     vector_store = get_vectorstore()
@@ -21,7 +30,6 @@ def main(query, **kwargs):
 
 if __name__ == "__main__":
     load_dotenv()
-    args = [arg.split("=", 1) for arg in sys.argv[2:]]
-    main(sys.argv[1], filter=dict(args))
+    main(sys.argv[1], where=split_filter(sys.argv[2:]))
 
 # similarity.py ends here
diff --git a/transform_md.py b/transform_md.py
index 537d4ff..2763eda 100755
--- a/transform_md.py
+++ b/transform_md.py
@@ -43,6 +43,7 @@ def save_content(file_path, text, check_content=True, **metadata):
     "save the text and metatada into a json file"
     if check_content:
         try:
+            print(f"reading {file_path}", file=sys.stderr)
             with open(file_path, "r", encoding="utf-8") as in_f:
                 data = json.load(in_f)
                 if data["text"] == text:
@@ -50,6 +51,8 @@
                     return False
         except FileNotFoundError:
             pass
+        except json.decoder.JSONDecodeError as exc:
+            print(f"invalid json file {file_path}: {exc}", file=sys.stderr)
     print(f"writing {file_path} metadata={metadata}", file=sys.stderr)
     data = {"text": text, "metadata": metadata}
     with open(file_path, "w", encoding="utf-8") as out_f:
@@ -201,11 +204,21 @@ def get_metadata(content):
     for idx, line in enumerate(lines):
         header = line.split(":", 1)
         if len(header) == 2:
-            metadata[header[0].strip()] = header[1].strip()
+            metadata[header[0].strip().lower()] = header[1].strip()
             continue
        if line in ("---", "", "..."):
             continue
         break
 
+    if "date" in metadata:
+        # transform the date string into a datetime object and save it as created_at
+        # because langchain uses that field
+        try:
+            metadata["created_at"] = datetime.datetime.strptime(
+                metadata["date"], "%Y/%m/%d %H:%M"
+            )
+            del metadata["date"]
+        except ValueError:
+            pass
     content = "\n".join(lines[idx:])
     return metadata, content
@@ -220,6 +233,18 @@ def remove_dash(content, level):
     return "\n".join(lines)
 
 
+def get_date(date_str):
+    "Get the date from a string trying different formats: 01 January 2020 then 01 Jan 2020"
+    try:
+        return datetime.datetime.strptime(date_str, "%d %B %Y")
+    except ValueError:
+        try:
+            return datetime.datetime.strptime(date_str, "%d %b %Y")
+        except ValueError:
+            print(f"Unable to parse date {date_str}", file=sys.stderr)
+    return date_str
+
+
 DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)
 
@@ -247,13 +272,16 @@ def split_md_file(fname, md_dir):
     stat = os.stat(fname)
     os.utime(base_fname, (stat.st_atime, stat.st_mtime))
     for idx in range(1, len(history), 2):
-        history_date = datetime.datetime.strptime(history[idx], "%d %b %Y")
+        history_date = get_date(history[idx])
+        if isinstance(history_date, str):
+            continue
         if level == 1:
             date = history_date.strftime("%d")
         else:
             date = history_date.strftime("%Y%m%d")
         part_fname = os.path.join(md_dir, basename + date + ".md")
         with open(part_fname, "w", encoding="UTF-8") as fptr:
+            fptr.write(f"---\nReferer: {basename}\n---\n\n")
             fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
         mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
         os.utime(part_fname, (mtime, mtime))
@@ -274,8 +302,10 @@ def write_output_file(md_file, out_dir, metadata):
         metadata, content = get_metadata(output.page_content)
         metadata["type"] = "notes"
     else:
-        content = output.page_content
-        metadata["last_accessed_at"] = (last_accessed_at,)
+        new_metadata, content = get_metadata(output.page_content)
+        metadata.update(new_metadata)
+        metadata["type"] = "history"
+        metadata["last_accessed_at"] = last_accessed_at
     if "url" not in metadata:
         metadata["url"] = f"file://{md_file}"
     print(f"saving {md_file=} with {metadata=}", file=sys.stderr)
@@ -307,7 +337,6 @@ def process_md_file(fname, out_dir, checksum_store):
         return False
     basename = os.path.basename(fname[:-3])
     oname = get_output_file_path(out_dir, basename)
-    stat = os.stat(fname)
     if is_same_time(fname, oname):
         print(f"skipping {fname} as there is no time change", file=sys.stderr)
         return False
@@ -321,8 +350,6 @@ def process_md_file(fname, out_dir, checksum_store):
         print(f"skipping {fname} as content did not change", file=sys.stderr)
         return False
     print(f"processed '{fname}'", file=sys.stderr)
-    # set the timestamp to be the same
-    os.utime(oname, (stat.st_atime, stat.st_mtime))
     return True
 
 
diff --git a/transform_txt.py b/transform_txt.py
index f2e23c5..ba42e5f 100755
--- a/transform_txt.py
+++ b/transform_txt.py
@@ -5,6 +5,7 @@
 into vector embeddings and store the vectors in a vector database.
 """
 
+import datetime
 import json
 import os
 import sys
@@ -54,12 +55,21 @@ def validate_and_extract_url(fname, basename, out_dir):
     if is_same_time(fname, oname):
         return False, None
     with open(fname, encoding="utf-8") as in_stream:
-        data = json.load(in_stream, object_hook=datetime_decoder)
+        try:
+            data = json.load(in_stream, object_hook=datetime_decoder)
+        except json.JSONDecodeError as exc:
+            print(f"Could not parse {fname}: {exc}", file=sys.stderr)
+            return False, None
+    if "metadata" not in data:
+        print(f"Could not find metadata in {fname}", file=sys.stderr)
+        return False, None
 
     metadata = data["metadata"]
     # convert the datetime to timestamp because chromadb does not
     # support datetime
-    if "last_accessed_at" in metadata:
-        metadata["last_accessed_at"] = metadata["last_accessed_at"].timestamp()
+    for key, val in metadata.items():
+        # check if the value is a datetime
+        if isinstance(val, datetime.datetime):
+            metadata[key] = val.timestamp()
 
     return metadata, data["text"]