Merge pull request #139 from fako/development

Development

fako committed Aug 25, 2019
2 parents 655e3d0 + e4a4749, commit 6e88bca
Showing 45 changed files with 722 additions and 292 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -14,7 +14,6 @@ datagrowth/docs/_build/
datascope/settings.py
datascope/bootstrap.py
datascope/secrets.py
datascope/mysql/data/
datascope/logs/
datascope/statics/

13 changes: 9 additions & 4 deletions Makefile
@@ -9,18 +9,23 @@ deploy: clean
sudo service celeryd restart

backup-db:
mysqldump -uroot -p --databases datascope > data/datascope.mysql.sql
pg_dump -h localhost -U postgres datascope > data/datascope.postgres.sql

import-db:
cat $(backup) | psql -h localhost -U postgres datascope

backup-data:
	# Syncing local data to a hard drive
# -z means use compression
# -r means recursive
	# -t means preserve modification times
# -h means human readable output
# -v means verbose
rsync -zrthv --progress data /Volumes/Leo65/data/datascope

start-celery:
celery -A datascope worker --loglevel=info -B

start-mysql:
mysql --protocol=tcp -uroot -p

start-postgres:
psql -h localhost -U postgres -d postgres
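
Aside (not part of the diff): the import-db target reads its dump file from the make variable $(backup), so restoring the file written by backup-db would be invoked as make import-db backup=data/datascope.postgres.sql.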

2 changes: 1 addition & 1 deletion core/models/organisms/__init__.py
@@ -1,4 +1,4 @@
from .individual import Individual
from .collective import Collective
from .growth import Growth
from .community import Community
from .community import Community, CommunityCollectionDocumentMixin
30 changes: 25 additions & 5 deletions core/models/organisms/community.py
@@ -10,6 +10,7 @@
from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation, ContentType

from datagrowth.datatypes.documents.db.base import DataStorage
from datagrowth.datatypes import CollectionBase
from datagrowth.configuration import ConfigurationField
from core.models.organisms.states import CommunityState, COMMUNITY_STATE_CHOICES
from core.models.organisms import Growth, Collective, Individual
@@ -189,14 +190,14 @@ def setup_growth(self, *args):
inp = grw.output
elif inp is None:
inp = self.initial_input(*args)
elif inp.startswith("Collective"):
elif inp.startswith("Collective") or inp.startswith("Collection"):
if "#" in inp:
inp, identifier = inp.split("#")
else:
identifier = None
inp = self.create_organism(inp, sch, identifier)
inp.identifier = identifier
elif inp == "Individual":
elif inp == "Individual" or inp == "Document":
inp = self.create_organism(inp, sch)

out = growth_config["output"]
@@ -212,14 +213,14 @@ def setup_growth(self, *args):
out = grw.output
elif out == "&input":
out = inp
elif out.startswith("Collective"):
elif out.startswith("Collective") or out.startswith("Collection"):
if "#" in out:
out, identifier = out.split("#")
else:
identifier = None
out = self.create_organism(out, sch, identifier)
out.identifier = identifier
elif out == "Individual":
elif out == "Individual" or out == "Document":
out = self.create_organism(out, sch)
else:
raise AssertionError("Invalid value for output: {}".format(out))
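
As an illustration of what these branches accept (a sketch, not taken from the diff; the configuration keys are assumptions): growth specs may now name the Datagrowth types alongside the legacy ones, with an optional identifier after a "#":

# Hypothetical growth entry; "Collection#url" is split on "#" into the
# organism type and its identifier, while a bare "Document" creates a
# single Document organism ("Collective" and "Individual" still work).
COMMUNITY_SPIRIT = {
    "gather": {
        "process": "HttpResourceProcessor.fetch",
        "input": "Collection#url",
        "output": "Document",
    },
}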
@@ -338,7 +339,7 @@ def manifestation(self):
if data is None:
if not issubclass(processor.__class__, QuerySetProcessor):
data = self.kernel.content
elif isinstance(self.kernel, Collective):
elif isinstance(self.kernel, (Collective, CollectionBase,)):
data = self.kernel.documents.all()
else:
raise AssertionError("Kernel can't be other than Collective when using a QuerySetProcessor")
@@ -381,3 +382,22 @@ def __str__(self):
class Meta:
abstract = True
get_latest_by = "created_at"


class CommunityCollectionDocumentMixin(models.Model):

collection_set = GenericRelation("Collection", content_type_field="community_type", object_id_field="community_id")
document_set = GenericRelation("Document", content_type_field="community_type", object_id_field="community_id")
collective_set = None
individual_set = None

@property
def collections(self):
return self.collection_set

@property
def documents(self):
return self.document_set

class Meta:
abstract = True
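
A minimal usage sketch (class name hypothetical, not from this commit): a concrete community lists the mixin before Community, so the Collection/Document relations and the collections/documents properties above replace the legacy Collective/Individual ones:

# Hypothetical concrete community using the new mixin:
class WikipediaCommunity(CommunityCollectionDocumentMixin, Community):
    pass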
26 changes: 13 additions & 13 deletions core/models/organisms/growth.py
@@ -174,7 +174,7 @@ def prepare_contributions(self, success_resources):
def append_to_output(self, contributions):
assert isinstance(self.output, (Collective, CollectionBase)), \
"append_to_output expects a Collective or Collection as output"
self.output.update(contributions)
self.output.add(contributions)

def inline_by_key(self, contributions, inline_key):
assert isinstance(self.output, (Collective, CollectionBase)), \
@@ -185,24 +185,24 @@ def inline_by_key(self, contributions, inline_key):
self.output.identifier = "{}.{}".format(original_identifier, original_identifier)
self.output.save()
for contribution in contributions:
affected_individuals = self.output.documents.filter(identity=contribution[inline_key])
for individual in affected_individuals.iterator():
individual.properties[inline_key] = contribution
individual.clean()
individual.save()
affected_documents = self.output.documents.filter(identity=contribution[inline_key])
for document in affected_documents.iterator():
document.properties[inline_key] = contribution
document.clean()
document.save()

def _update_collection_by_key(self, contributions, update_key):
for contribution in contributions:
identifier = self.output.identifier
assert identifier == update_key, \
"Identifier of output '{}' does not match update key '{}'".format(identifier, update_key)
affected_individuals = self.output.documents.filter(identity=contribution[update_key])
for individual in affected_individuals.iterator():
individual.update(contribution)
individual.clean()
individual.save()
affected_documents = self.output.documents.filter(identity=contribution[update_key])
for document in affected_documents.iterator():
document.update(contribution)
document.clean()
document.save()

def _update_individual_by_key(self, contributions, update_key):
def _update_document_by_key(self, contributions, update_key):
for contribution in contributions:
assert update_key in self.output.properties, \
"Output does not contain update key '{}'".format(update_key)
@@ -214,7 +214,7 @@ def update_by_key(self, contributions, update_key):
if isinstance(self.output, (Collective, CollectionBase)):
self._update_collection_by_key(contributions, update_key)
elif isinstance(self.output, (Individual, DocumentBase)):
self._update_individual_by_key(contributions, update_key)
self._update_document_by_key(contributions, update_key)
else:
raise AssertionError("update_by_key expects a Collective/Collection or Individual/Document as output")
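
A short usage sketch (contribution values invented): contributions are plain dicts; append_to_output now stores them through Collection.add, and update_by_key dispatches on the output type as above:

# Hypothetical contributions produced by a processor:
contributions = [{"id": "page-1", "wikidata": "Q1"}]
growth.append_to_output(contributions)                # add as new Documents
growth.update_by_key(contributions, update_key="id")  # merge into matches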

2 changes: 1 addition & 1 deletion core/models/organisms/individual.py
@@ -6,7 +6,7 @@
from datagrowth.datatypes import DocumentBase, DocumentMysql


class Individual(DocumentBase, DocumentMysql):
class Individual(DocumentMysql, DocumentBase):

community = GenericForeignKey(ct_field="community_type", fk_field="community_id")
community_type = models.ForeignKey(ContentType, related_name="+")
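
Note on the reordered bases (an inference from this commit, not stated in it): DocumentBase now defines properties as a property that raises NotImplementedError (see document.py below), so the storage mixin must come first in the MRO for its concrete properties field to take precedence; with the old order, attribute lookup resolved to the abstract property instead.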
21 changes: 17 additions & 4 deletions core/processors/filter.py
@@ -1,17 +1,30 @@
from django.db.models.query import Q

from core.processors.base import QuerySetProcessor
from datagrowth.datatypes import DocumentPostgres


class FilterProcessor(QuerySetProcessor):

def get_query_filter_for_postgres(self, criteria):
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(**{"properties__{}".format(key): value})
return query_filter

def get_query_filter_for_non_postgres(self, criteria):
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(properties__contains='{}": "{}'.format(key, value))
return query_filter

def filter(self, query_set):
criteria = {
key: self.config.get(key).split("|") for key in self.config.select_keys
if self.config.get(key, None)
}
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(properties__contains='{}": "{}'.format(key, value))
query_filter = self.get_query_filter_for_postgres(criteria) if issubclass(query_set.model, DocumentPostgres) \
else self.get_query_filter_for_non_postgres(criteria)
return query_set.filter(query_filter)
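
To make the two branches concrete (criteria values invented): for Postgres-backed Documents the filter becomes native JSON key lookups, while other backends fall back to substring matching on the serialized properties text:

# Hypothetical criteria, as filter() builds them from select_keys:
criteria = {"language": ["en", "nl"]}
# Postgres:  Q(properties__language="en") | Q(properties__language="nl")
# Fallback:  Q(properties__contains='language": "en') |
#            Q(properties__contains='language": "nl')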
4 changes: 2 additions & 2 deletions datagrowth/admin.py
@@ -1,4 +1,4 @@
from datagrowth.resources.http.admin import HttpResourceAdmin
from datagrowth.resources.shell.admin import ShellResourceAdmin

from datagrowth.datatypes.documents.admin import DataStorageAdmin
from datagrowth.datatypes.annotations.admin import AnnotationAdmin
from datagrowth.datatypes.documents.admin import DataStorageAdmin, DocumentAdmin
6 changes: 6 additions & 0 deletions datagrowth/datatypes/annotations/admin.py
@@ -0,0 +1,6 @@
from django.contrib import admin


class AnnotationAdmin(admin.ModelAdmin):
list_display = ("reference", "name", "annotation", "user", "created_at", "modified_at")
search_fields = ("reference", "string",)
4 changes: 4 additions & 0 deletions datagrowth/datatypes/documents/admin.py
@@ -3,3 +3,7 @@

class DataStorageAdmin(admin.ModelAdmin):
list_display = ['__str__', 'created_at', 'modified_at']


class DocumentAdmin(DataStorageAdmin):
search_fields = ["properties"]
10 changes: 5 additions & 5 deletions datagrowth/datatypes/documents/db/collection.py
@@ -24,9 +24,7 @@ def get_document_model(cls):

@property
def documents(self):
# This method should be smart about returning the correct document_set
Document = self.get_document_model()
return Document.objects.all()
raise NotImplementedError("CollectionBase needs to implement the documents property to work correctly")

@property
def annotations(self):
@@ -65,11 +63,13 @@ def validate(cls, data, schema):

def add(self, data, validate=True, reset=False, batch_size=500, collection=None):
"""
Update the instance with new data by adding to the Collection
or by updating Documents that members off the Collection.
Add new data to the Collection in batches, possibly deleting all data before adding.
:param data: The data to use for the update
:param validate: (optional) whether to validate data or not (yes by default)
:param reset: (optional) whether to delete existing data or not (no by default)
:param batch_size: (optional) how many instances to add in a single batch (default: 500)
:param collection: (optional) a collection instance to add the data to (default: self)
:return: A list of updated or created instances.
"""
collection = collection or self
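
A minimal call sketch matching the docstring (data invented):

# Hypothetical usage: wipe the collection, then store two documents
# in batches of 100 after schema validation.
collection.add(
    [{"id": 1, "title": "a"}, {"id": 2, "title": "b"}],
    reset=True,
    batch_size=100,
)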
8 changes: 8 additions & 0 deletions datagrowth/datatypes/documents/db/document.py
@@ -19,6 +19,10 @@ class DocumentBase(DataStorage):
identity = models.CharField(max_length=255, blank=True, null=True, db_index=True)
reference = models.CharField(max_length=255, blank=True, null=True, db_index=True)

@property
def properties(self):
raise NotImplementedError("DocumentBase does not implement properties, use DocumentPostgres or DocumentMysql")

def __getitem__(self, key):
return self.properties[key]

@@ -143,6 +147,8 @@ class DocumentMysql(models.Model):

class Meta:
abstract = True
get_latest_by = "id"
ordering = ["id"]


class DocumentPostgres(models.Model):
@@ -151,3 +157,5 @@ class DocumentPostgres(models.Model):

class Meta:
abstract = True
get_latest_by = "id"
ordering = ["id"]
29 changes: 7 additions & 22 deletions datagrowth/management/commands/dump_dataset.py
@@ -1,37 +1,22 @@
import os

from django.core.serializers import serialize

from datagrowth.management.base import DatasetCommand
from datagrowth.utils import ibatch, get_dumps_path
from datagrowth.utils import get_dumps_path, object_to_disk, queryset_to_disk


class Command(DatasetCommand):
"""
Dumps a dataset by signature
"""

@staticmethod
def queryset_to_disk(queryset, json_file, batch_size=100):
count = queryset.all().count()
batch_iterator = ibatch(queryset.iterator(), batch_size=batch_size, progress_bar=True, total=count)
for batch in batch_iterator:
batch_data = serialize("json", batch, use_natural_foreign_keys=True)
json_file.writelines([batch_data + "\n"])

@staticmethod
def object_to_disk(object, json_file):
setattr(object, "current_growth", None) # resets dataset
batch_data = serialize("json", [object], use_natural_foreign_keys=True)
json_file.write(batch_data + "\n")

def handle_dataset(self, dataset, *args, **options):
setattr(dataset, "current_growth", None) # resets the dataset
destination = get_dumps_path(dataset)
if not os.path.exists(destination):
os.makedirs(destination)
file_name = os.path.join(destination, "{}.json".format(dataset.signature))
file_name = os.path.join(destination, "{}.{}.json".format(dataset.signature, dataset.id))
with open(file_name, "w") as json_file:
self.object_to_disk(dataset, json_file)
self.queryset_to_disk(dataset.growth_set, json_file)
self.queryset_to_disk(dataset.collections, json_file)
self.queryset_to_disk(dataset.documents, json_file)
object_to_disk(dataset, json_file)
queryset_to_disk(dataset.growth_set, json_file)
queryset_to_disk(dataset.collections, json_file)
queryset_to_disk(dataset.documents, json_file)
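
The shared helpers keep the one-JSON-batch-per-line format of the removed static methods, so a matching reader (hypothetical, not part of this commit) can stream a dump named "<signature>.<id>.json" back through Django's deserializer:

from django.core.serializers import deserialize

# Hypothetical loader for the line-per-batch dump format:
with open("dumps/dataset.1.json") as json_file:
    for line in json_file:
        for deserialized in deserialize("json", line):
            deserialized.save()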
22 changes: 22 additions & 0 deletions datagrowth/management/commands/dump_resource.py
@@ -0,0 +1,22 @@
import os

from django.core.management.base import LabelCommand
from django.apps import apps

from datagrowth.utils import get_dumps_path, queryset_to_disk


class Command(LabelCommand):
"""
Dumps all objects from a Resource to file
"""

def handle_label(self, label, **options):
Resource = apps.get_model(label)
destination = get_dumps_path(Resource)
if not os.path.exists(destination):
os.makedirs(destination)
resource_name = Resource.get_name()
file_path = os.path.join(destination, "{}.dump.json".format(resource_name))
with open(file_path, "w") as dump_file:
queryset_to_disk(Resource.objects, dump_file)
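
Because this is a LabelCommand that resolves its label via apps.get_model, it is invoked with the usual app_label.ModelName form, for example (model name hypothetical): python manage.py dump_resource sources.WikipediaSearch.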