Merge pull request #117 from marieai/develop
Merging after history rewrite on develop
gregbugaj authored Jul 2, 2024
2 parents c5c3947 + 2aea97c commit c98b063
Showing 111 changed files with 6,579 additions and 2,156 deletions.
2 changes: 1 addition & 1 deletion .run/batch_document_ocr.run.xml
@@ -14,7 +14,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/examples/batch_document_ocr.py" />
<option name="PARAMETERS" value="--config config.dev.json --pipeline default --input ~/tmp/analysis/marie-issues/108/195588965/195588965-0001.png --output_dir ~/tmp/analysis/marie-issues/108/195588965" />
<option name="PARAMETERS" value="--config config.dev.json --pipeline default --input $USER_HOME$/tmp/analysis/oda-research/204019943/PID_900_6705_0_204019943.tif --output_dir $USER_HOME$/tmp/analysis/oda-research/204019943/300dpi" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
@@ -1,6 +1,7 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="marie gateway" type="PythonConfigurationType" factoryName="Python">
<configuration default="false" name="marie gateway - params" type="PythonConfigurationType" factoryName="Python">
<module name="marie-ai" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
@@ -13,7 +14,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/marie/__main__.py" />
<option name="PARAMETERS" value="gateway --protocol GRPC --host 192.168.102.53 --port 52000 --discovery --discovery-host 127.0.0.1 --discovery-port 8500" />
<option name="PARAMETERS" value="gateway --protocols GRPC HTTP --ports 52000 51000 --host 0.0.0.0" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
25 changes: 25 additions & 0 deletions .run/marie gateway-FILE.run.xml
@@ -0,0 +1,25 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="marie gateway-FILE" type="PythonConfigurationType" factoryName="Python">
<module name="marie-ai" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/marie/__main__.py" />
<option name="PARAMETERS" value="gateway --uses /mnt/data/marie-ai/config/service/gateway.yml" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
</component>
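For context, the `--uses` parameter above points the gateway at a YAML service definition. A minimal hypothetical sketch of such a file, assuming the same Flow-style schema as the config/service/bones.yml added later in this diff (all values below are placeholders, not the actual file at /mnt/data/marie-ai/config/service/gateway.yml):

```yaml
# hypothetical gateway.yml sketch; schema and values are assumptions
jtype: Flow
version: '1'
protocol: grpc

with:
  port:
    - 52000
  discovery: True
  discovery_host: 127.0.0.1
  discovery_port: 8500
```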
21 changes: 20 additions & 1 deletion README-GB.md
@@ -476,6 +476,9 @@ https://github.com/fioresxcat/VAT_245/tree/fa526ac7e2ce9bb392ca66bd86305d69caee7
# Table Transformer and Table Detection
https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb

# IDEAS
dedoc
https://github.com/ispras/dedoc/blob/master/dedoc/structure_constructors/abstract_structure_constructor.py


https://cloud.google.com/document-ai
@@ -505,4 +508,20 @@ event.json
act.secrets
```
MARIE_CORE_RELEASE_TOKEN=ghp_ABC
```
```

## Pydantic
```bash
pydantic 1.10.15
pydantic_core 2.10.1
```





# Rewriting history

```bash
git filter-repo --mailmap mailmap --force
```
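For reference, the `--mailmap` option takes a file in git's standard mailmap format, which maps old author names/e-mail addresses to the ones the rewritten history should carry. A minimal sketch (names, addresses, and the file name `mailmap` here are placeholders):

```bash
# hypothetical mailmap entries; identities below are placeholders
cat > mailmap <<'EOF'
Jane Developer <jane@example.com> jdoe <jdoe@old-host.example>
Jane Developer <jane@example.com> <jane@another-old-host.example>
EOF
```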
7 changes: 7 additions & 0 deletions README.md
@@ -127,6 +127,13 @@ aws s3 cp some_file.txt s3://mybucket --profile marie --endpoint-url http://loc
aws s3 --profile marie --endpoint-url=http://127.0.0.1:8000 ls --recursive s3://
```

Remove files from the bucket
```shell
aws s3 rm s3://marie --recursive --profile marie --endpoint-url http://localhost:8000
```
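To preview which objects the command above would delete, `aws s3 rm` also accepts a `--dryrun` flag (same profile and endpoint assumptions as above):
```shell
aws s3 rm s3://marie --recursive --dryrun --profile marie --endpoint-url http://localhost:8000
```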



# Production setup


278 changes: 278 additions & 0 deletions config/service/bones.yml
@@ -0,0 +1,278 @@
jtype: Flow
version: '1'
protocol: grpc

# Shared configuration
shared_config:
storage: &storage
psql: &psql_conf_shared
provider: postgresql
hostname: 127.0.0.1
port: 5432
username: postgres
password: 123456
database: postgres
default_table: shared_docs

message: &message
amazon_mq : &amazon_mq_conf_shared
provider: amazon-rabbitmq
hostname: ${{ ENV.AWS_MQ_HOSTNAME }}
port: 15672
username: ${{ ENV.AWS_MQ_USERNAME }}
password: ${{ ENV.AWS_MQ_PASSWORD }}
tls: True
virtualhost: /


rabbitmq : &rabbitmq_conf_shared
provider: rabbitmq
hostname: ${{ ENV.RABBIT_MQ_HOSTNAME }}
port: ${{ ENV.RABBIT_MQ_PORT }}
username: ${{ ENV.RABBIT_MQ_USERNAME }}
password: ${{ ENV.RABBIT_MQ_PASSWORD }}
tls: False
virtualhost: /


# Toast event tracking system
# It can be backed by a message queue and/or a database
toast:
native:
enabled: True
path: /tmp/marie/events.json
rabbitmq:
<<: *rabbitmq_conf_shared
enabled : False
psql:
<<: *psql_conf_shared
default_table: event_tracking
enabled : False


# Document Storage
# The storage service is used to store the data that is being processed
# Storage can be backed by an S3-compatible object store

storage:
# S3 configuration. Will be used only if value of backend is "s3"
s3:
enabled: False
metadata_only: False # If True, only metadata will be stored in the storage backend
# API endpoint to connect to. Use AWS S3 or any S3-compatible object storage endpoint.
endpoint_url: ${{ ENV.S3_ENDPOINT_URL }}
# optional.
# access key id when using static credentials.
access_key_id: ${{ ENV.S3_ACCESS_KEY_ID }}
# optional.
# secret key when using static credentials.
secret_access_key: ${{ ENV.S3_SECRET_ACCESS_KEY }}
# Bucket name in s3
bucket_name: ${{ ENV.S3_BUCKET_NAME }}
# optional.
# Example: "region: us-east-2"
region: ${{ ENV.S3_REGION }}
# optional.
# enable if endpoint is http
insecure: True
# optional.
# enable if you want to use path style requests
addressing_style: path

# postgresql configuration. Will be used only if value of backend is "psql"
psql:
<<: *psql_conf_shared
default_table: store_metadata
enabled : False

# Job Queue scheduler
scheduler:
psql:
<<: *psql_conf_shared
default_table: job_queue
enabled : True

# FLOW / GATEWAY configuration

with:
port:
- 51000
- 52000
protocol:
- http
- grpc
discovery: True
discovery_host: 127.0.0.1
discovery_port: 8500

host: 127.0.0.1

# monitoring
monitoring: true
port_monitoring: 57843

event_tracking: True

expose_endpoints:
/document/extract:
methods: ["POST"]
summary: Extract data-POC
tags:
- extract
/status:
methods: ["POST"]
summary: Status
tags:
- extract

/text/status:
methods: ["POST"]
summary: Extract data
tags:
- extract

/ner/extract:
methods: ["POST"]
summary: Extract NER
tags:
- ner

/document/classify:
methods: ["POST"]
summary: Classify document at page level
tags:
- classify

prefetch: 4

executors:
# - name: extract_executor
# uses:
# jtype: TextExtractionExecutorMock
# metas:
# py_modules:
# - marie.executor.text
# timeout_ready: 3000000
# replicas: 1
## replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

- name: extract_t
uses:
jtype: TextExtractionExecutor
# jtype: TextExtractionExecutorMock
with:
storage:
# postgresql configuration. Will be used only if value of backend is "psql"
psql:
<<: *psql_conf_shared
default_table: extract_metadata
enabled: True
pipeline:
name: 'default'
page_classifier:
- model_name_or_path: 'marie/lmv3-medical-document-classification'
type: 'transformers'
device: 'cuda'
enabled: False
name: 'medical_page_classifier'
- model_name_or_path: 'marie/lmv3-medical-document-payer'
type: 'transformers'
enabled: False
device: 'cuda'
name: 'medical_payer_classifier'
page_indexer:
- model_name_or_path: 'marie/layoutlmv3-medical-document-indexer'
enabled: False
type: 'transformers'
device: 'cuda'
name: 'page_indexer_patient'
filter:
type: 'regex'
pattern: '.*'
page_splitter:
model_name_or_path: 'marie/layoutlmv3-medical-document-splitter'
enabled: True
metas:
py_modules:
- marie.executor.text
timeout_ready: 3000000
replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
env:
CUDA_VISIBLE_DEVICES: RR

# - name: extract_xyz
# uses:
# jtype: TextExtractionExecutorMock
# metas:
# py_modules:
# - marie.executor.text
# timeout_ready: 3000000
# replicas: 1
## replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

# - name: ner_t
# uses:
# jtype: NerExtractionExecutor
# with:
# model_name_or_path : 'rms/layoutlmv3-large-corr-ner'
# <<: *psql_conf_shared
# storage_enabled : False
# metas:
# py_modules:
## - marie_server.executors.ner.mserve_torch
# - marie.executor.ner
# timeout_ready: 3000000
## replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR

# - name: document_classifier
# uses:
# jtype: DocumentClassificationExecutor
# with:
# model_name_or_path :
# - 'marie/layoutlmv3-document-classification'
# - 'marie/layoutlmv3-document-classification'
# <<: *psql_conf_shared
# storage_enabled : False
# metas:
# py_modules:
# - marie.executor.classifier
# timeout_ready: 3000000
## replicas: 1
# replicas: ${{ CONTEXT.gpu_device_count }}
# env :
# CUDA_VISIBLE_DEVICES: RR
##
# - name: overlay_t
# uses:
# jtype: OverlayExecutor
# with:
# model_name_or_path : 'rms/holder'
# <<: *storage_conf
# storage_enabled : True
# metas:
# py_modules:
# - marie.executor.overlay
# timeout_ready: 3000000
# replicas: 1

# Authentication and Authorization configuration

auth:
keys:
- name : service-A
api_key : mas_0aPJ9Q9nUO1Ac1vJTfffXEXs9FyGLf9BzfYgZ_RaHm707wmbfHJNPQ
enabled : True
roles : [admin, user]

- name : service-B
api_key : mau_t6qDi1BcL1NkLI8I6iM8z1va0nZP01UQ6LWecpbDz6mbxWgIIIZPfQ
enabled : True
roles : [admin, user]
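As a side note, the `${{ ENV.* }}` placeholders in this config are presumably resolved from environment variables when the file is loaded. A hypothetical shell snippet for a local development setup, using the variable names referenced above (all values are placeholders):

```bash
# hypothetical local-development values; replace with real credentials
export S3_ENDPOINT_URL=http://localhost:8000
export S3_ACCESS_KEY_ID=marie
export S3_SECRET_ACCESS_KEY=changeme
export S3_BUCKET_NAME=marie
export S3_REGION=us-east-1
export RABBIT_MQ_HOSTNAME=127.0.0.1
export RABBIT_MQ_PORT=5672
export RABBIT_MQ_USERNAME=guest
export RABBIT_MQ_PASSWORD=guest
```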