Merge pull request #139 from fako/development

Development

fako committed Aug 25, 2019
2 parents 655e3d0 + e4a4749, commit 6e88bca
Showing 45 changed files with 722 additions and 292 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -14,7 +14,6 @@ datagrowth/docs/_build/
datascope/settings.py
datascope/bootstrap.py
datascope/secrets.py
datascope/mysql/data/
datascope/logs/
datascope/statics/

13 changes: 9 additions & 4 deletions Makefile
@@ -9,18 +9,23 @@ deploy: clean
sudo service celeryd restart

backup-db:
mysqldump -uroot -p --databases datascope > data/datascope.mysql.sql
pg_dump -h localhost -U postgres datascope > data/datascope.postgres.sql

import-db:
cat $(backup) | psql -h localhost -U postgres datascope

backup-data:
	# Syncing local data to a hard drive
# -z means use compression
# -r means recursive
	# -t means preserve modification times
# -h means human readable output
# -v means verbose
rsync -zrthv --progress data /Volumes/Leo65/data/datascope

start-celery:
celery -A datascope worker --loglevel=info -B

start-mysql:
mysql --protocol=tcp -uroot -p

start-postgres:
psql -h localhost -U postgres -d postgres
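
Aside (not part of the diff): the import-db target reads its dump file from the make variable $(backup), so restoring the file written by backup-db would be invoked as make import-db backup=data/datascope.postgres.sql.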

2 changes: 1 addition & 1 deletion core/models/organisms/__init__.py
@@ -1,4 +1,4 @@
from .individual import Individual
from .collective import Collective
from .growth import Growth
from .community import Community
from .community import Community, CommunityCollectionDocumentMixin
30 changes: 25 additions & 5 deletions core/models/organisms/community.py
@@ -10,6 +10,7 @@
from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation, ContentType

from datagrowth.datatypes.documents.db.base import DataStorage
from datagrowth.datatypes import CollectionBase
from datagrowth.configuration import ConfigurationField
from core.models.organisms.states import CommunityState, COMMUNITY_STATE_CHOICES
from core.models.organisms import Growth, Collective, Individual
@@ -189,14 +190,14 @@ def setup_growth(self, *args):
inp = grw.output
elif inp is None:
inp = self.initial_input(*args)
elif inp.startswith("Collective"):
elif inp.startswith("Collective") or inp.startswith("Collection"):
if "#" in inp:
inp, identifier = inp.split("#")
else:
identifier = None
inp = self.create_organism(inp, sch, identifier)
inp.identifier = identifier
elif inp == "Individual":
elif inp == "Individual" or inp == "Document":
inp = self.create_organism(inp, sch)

out = growth_config["output"]
@@ -212,14 +213,14 @@ def setup_growth(self, *args):
out = grw.output
elif out == "&input":
out = inp
elif out.startswith("Collective"):
elif out.startswith("Collective") or out.startswith("Collection"):
if "#" in out:
out, identifier = out.split("#")
else:
identifier = None
out = self.create_organism(out, sch, identifier)
out.identifier = identifier
elif out == "Individual":
elif out == "Individual" or out == "Document":
out = self.create_organism(out, sch)
else:
raise AssertionError("Invalid value for output: {}".format(out))
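
As an illustration of what these branches accept (a sketch, not taken from the diff; the configuration keys are assumptions): growth specs may now name the Datagrowth types alongside the legacy ones, with an optional identifier after a "#":

# Hypothetical growth entry; "Collection#url" is split on "#" into the
# organism type and its identifier, while a bare "Document" creates a
# single Document organism ("Collective" and "Individual" still work).
COMMUNITY_SPIRIT = {
    "gather": {
        "process": "HttpResourceProcessor.fetch",
        "input": "Collection#url",
        "output": "Document",
    },
}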
@@ -338,7 +339,7 @@ def manifestation(self):
if data is None:
if not issubclass(processor.__class__, QuerySetProcessor):
data = self.kernel.content
elif isinstance(self.kernel, Collective):
elif isinstance(self.kernel, (Collective, CollectionBase,)):
data = self.kernel.documents.all()
else:
raise AssertionError("Kernel can't be other than Collective when using a QuerySetProcessor")
@@ -381,3 +382,22 @@ def __str__(self):
class Meta:
abstract = True
get_latest_by = "created_at"


class CommunityCollectionDocumentMixin(models.Model):

collection_set = GenericRelation("Collection", content_type_field="community_type", object_id_field="community_id")
document_set = GenericRelation("Document", content_type_field="community_type", object_id_field="community_id")
collective_set = None
individual_set = None

@property
def collections(self):
return self.collection_set

@property
def documents(self):
return self.document_set

class Meta:
abstract = True
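
A minimal usage sketch (class name hypothetical, not from this commit): a concrete community lists the mixin before Community, so the Collection/Document relations and the collections/documents properties above replace the legacy Collective/Individual ones:

# Hypothetical concrete community using the new mixin:
class WikipediaCommunity(CommunityCollectionDocumentMixin, Community):
    pass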
26 changes: 13 additions & 13 deletions core/models/organisms/growth.py
@@ -174,7 +174,7 @@ def prepare_contributions(self, success_resources):
def append_to_output(self, contributions):
assert isinstance(self.output, (Collective, CollectionBase)), \
"append_to_output expects a Collective or Collection as output"
self.output.update(contributions)
self.output.add(contributions)

def inline_by_key(self, contributions, inline_key):
assert isinstance(self.output, (Collective, CollectionBase)), \
@@ -185,24 +185,24 @@ def inline_by_key(self, contributions, inline_key):
self.output.identifier = "{}.{}".format(original_identifier, original_identifier)
self.output.save()
for contribution in contributions:
affected_individuals = self.output.documents.filter(identity=contribution[inline_key])
for individual in affected_individuals.iterator():
individual.properties[inline_key] = contribution
individual.clean()
individual.save()
affected_documents = self.output.documents.filter(identity=contribution[inline_key])
for document in affected_documents.iterator():
document.properties[inline_key] = contribution
document.clean()
document.save()

def _update_collection_by_key(self, contributions, update_key):
for contribution in contributions:
identifier = self.output.identifier
assert identifier == update_key, \
"Identifier of output '{}' does not match update key '{}'".format(identifier, update_key)
affected_individuals = self.output.documents.filter(identity=contribution[update_key])
for individual in affected_individuals.iterator():
individual.update(contribution)
individual.clean()
individual.save()
affected_documents = self.output.documents.filter(identity=contribution[update_key])
for document in affected_documents.iterator():
document.update(contribution)
document.clean()
document.save()

def _update_individual_by_key(self, contributions, update_key):
def _update_document_by_key(self, contributions, update_key):
for contribution in contributions:
assert update_key in self.output.properties, \
"Output does not contain update key '{}'".format(update_key)
@@ -214,7 +214,7 @@ def update_by_key(self, contributions, update_key):
if isinstance(self.output, (Collective, CollectionBase)):
self._update_collection_by_key(contributions, update_key)
elif isinstance(self.output, (Individual, DocumentBase)):
self._update_individual_by_key(contributions, update_key)
self._update_document_by_key(contributions, update_key)
else:
raise AssertionError("update_by_key expects a Collective/Collection or Individual/Document as output")
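
A short usage sketch (contribution values invented): contributions are plain dicts; append_to_output now stores them through Collection.add, and update_by_key dispatches on the output type as above:

# Hypothetical contributions produced by a processor:
contributions = [{"id": "page-1", "wikidata": "Q1"}]
growth.append_to_output(contributions)                # add as new Documents
growth.update_by_key(contributions, update_key="id")  # merge into matches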

2 changes: 1 addition & 1 deletion core/models/organisms/individual.py
@@ -6,7 +6,7 @@
from datagrowth.datatypes import DocumentBase, DocumentMysql


class Individual(DocumentBase, DocumentMysql):
class Individual(DocumentMysql, DocumentBase):

community = GenericForeignKey(ct_field="community_type", fk_field="community_id")
community_type = models.ForeignKey(ContentType, related_name="+")
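
Note on the reordered bases (an inference from this commit, not stated in it): DocumentBase now defines properties as a property that raises NotImplementedError (see document.py below), so the storage mixin must come first in the MRO for its concrete properties field to take precedence; with the old order, attribute lookup resolved to the abstract property instead.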
21 changes: 17 additions & 4 deletions core/processors/filter.py
@@ -1,17 +1,30 @@
from django.db.models.query import Q

from core.processors.base import QuerySetProcessor
from datagrowth.datatypes import DocumentPostgres


class FilterProcessor(QuerySetProcessor):

def get_query_filter_for_postgres(self, criteria):
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(**{"properties__{}".format(key): value})
return query_filter

def get_query_filter_for_non_postgres(self, criteria):
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(properties__contains='{}": "{}'.format(key, value))
return query_filter

def filter(self, query_set):
criteria = {
key: self.config.get(key).split("|") for key in self.config.select_keys
if self.config.get(key, None)
}
query_filter = Q()
for key, values in criteria.items():
for value in values:
query_filter |= Q(properties__contains='{}": "{}'.format(key, value))
query_filter = self.get_query_filter_for_postgres(criteria) if issubclass(query_set.model, DocumentPostgres) \
else self.get_query_filter_for_non_postgres(criteria)
return query_set.filter(query_filter)
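
To make the two branches concrete (criteria values invented): for Postgres-backed Documents the filter becomes native JSON key lookups, while other backends fall back to substring matching on the serialized properties text:

# Hypothetical criteria, as filter() builds them from select_keys:
criteria = {"language": ["en", "nl"]}
# Postgres:  Q(properties__language="en") | Q(properties__language="nl")
# Fallback:  Q(properties__contains='language": "en') |
#            Q(properties__contains='language": "nl')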
4 changes: 2 additions & 2 deletions datagrowth/admin.py
@@ -1,4 +1,4 @@
from datagrowth.resources.http.admin import HttpResourceAdmin
from datagrowth.resources.shell.admin import ShellResourceAdmin

from datagrowth.datatypes.documents.admin import DataStorageAdmin
from datagrowth.datatypes.annotations.admin import AnnotationAdmin
from datagrowth.datatypes.documents.admin import DataStorageAdmin, DocumentAdmin
6 changes: 6 additions & 0 deletions datagrowth/datatypes/annotations/admin.py
@@ -0,0 +1,6 @@
from django.contrib import admin


class AnnotationAdmin(admin.ModelAdmin):
list_display = ("reference", "name", "annotation", "user", "created_at", "modified_at")
search_fields = ("reference", "string",)
4 changes: 4 additions & 0 deletions datagrowth/datatypes/documents/admin.py
@@ -3,3 +3,7 @@

class DataStorageAdmin(admin.ModelAdmin):
list_display = ['__str__', 'created_at', 'modified_at']


class DocumentAdmin(DataStorageAdmin):
search_fields = ["properties"]
10 changes: 5 additions & 5 deletions datagrowth/datatypes/documents/db/collection.py
@@ -24,9 +24,7 @@ def get_document_model(cls):

@property
def documents(self):
# This method should be smart about returning the correct document_set
Document = self.get_document_model()
return Document.objects.all()
raise NotImplementedError("CollectionBase needs to implement the documents property to work correctly")

@property
def annotations(self):
@@ -65,11 +63,13 @@ def validate(cls, data, schema):

def add(self, data, validate=True, reset=False, batch_size=500, collection=None):
"""
Update the instance with new data by adding to the Collection
or by updating Documents that members off the Collection.
Add new data to the Collection in batches, possibly deleting all data before adding.
:param data: The data to use for the update
:param validate: (optional) whether to validate data or not (yes by default)
:param reset: (optional) whether to delete existing data or not (no by default)
:param batch_size: (optional) how many instances to add in a single batch (default: 500)
:param collection: (optional) a collection instance to add the data to (default: self)
:return: A list of updated or created instances.
"""
collection = collection or self
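
A minimal call sketch matching the docstring (data invented):

# Hypothetical usage: wipe the collection, then store two documents
# in batches of 100 after schema validation.
collection.add(
    [{"id": 1, "title": "a"}, {"id": 2, "title": "b"}],
    reset=True,
    batch_size=100,
)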
8 changes: 8 additions & 0 deletions datagrowth/datatypes/documents/db/document.py
@@ -19,6 +19,10 @@ class DocumentBase(DataStorage):
identity = models.CharField(max_length=255, blank=True, null=True, db_index=True)
reference = models.CharField(max_length=255, blank=True, null=True, db_index=True)

@property
def properties(self):
raise NotImplementedError("DocumentBase does not implement properties, use DocumentPostgres or DocumentMysql")

def __getitem__(self, key):
return self.properties[key]

@@ -143,6 +147,8 @@ class DocumentMysql(models.Model):

class Meta:
abstract = True
get_latest_by = "id"
ordering = ["id"]


class DocumentPostgres(models.Model):
@@ -151,3 +157,5 @@ class DocumentPostgres(models.Model):

class Meta:
abstract = True
get_latest_by = "id"
ordering = ["id"]
29 changes: 7 additions & 22 deletions datagrowth/management/commands/dump_dataset.py
@@ -1,37 +1,22 @@
import os

from django.core.serializers import serialize

from datagrowth.management.base import DatasetCommand
from datagrowth.utils import ibatch, get_dumps_path
from datagrowth.utils import get_dumps_path, object_to_disk, queryset_to_disk


class Command(DatasetCommand):
"""
Dumps a dataset by signature
"""

@staticmethod
def queryset_to_disk(queryset, json_file, batch_size=100):
count = queryset.all().count()
batch_iterator = ibatch(queryset.iterator(), batch_size=batch_size, progress_bar=True, total=count)
for batch in batch_iterator:
batch_data = serialize("json", batch, use_natural_foreign_keys=True)
json_file.writelines([batch_data + "\n"])

@staticmethod
def object_to_disk(object, json_file):
setattr(object, "current_growth", None) # resets dataset
batch_data = serialize("json", [object], use_natural_foreign_keys=True)
json_file.write(batch_data + "\n")

def handle_dataset(self, dataset, *args, **options):
setattr(dataset, "current_growth", None) # resets the dataset
destination = get_dumps_path(dataset)
if not os.path.exists(destination):
os.makedirs(destination)
file_name = os.path.join(destination, "{}.json".format(dataset.signature))
file_name = os.path.join(destination, "{}.{}.json".format(dataset.signature, dataset.id))
with open(file_name, "w") as json_file:
self.object_to_disk(dataset, json_file)
self.queryset_to_disk(dataset.growth_set, json_file)
self.queryset_to_disk(dataset.collections, json_file)
self.queryset_to_disk(dataset.documents, json_file)
object_to_disk(dataset, json_file)
queryset_to_disk(dataset.growth_set, json_file)
queryset_to_disk(dataset.collections, json_file)
queryset_to_disk(dataset.documents, json_file)
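
The shared helpers keep the one-JSON-batch-per-line format of the removed static methods, so a matching reader (hypothetical, not part of this commit) can stream a dump named "<signature>.<id>.json" back through Django's deserializer:

from django.core.serializers import deserialize

# Hypothetical loader for the line-per-batch dump format:
with open("dumps/dataset.1.json") as json_file:
    for line in json_file:
        for deserialized in deserialize("json", line):
            deserialized.save()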
22 changes: 22 additions & 0 deletions datagrowth/management/commands/dump_resource.py
@@ -0,0 +1,22 @@
import os

from django.core.management.base import LabelCommand
from django.apps import apps

from datagrowth.utils import get_dumps_path, queryset_to_disk


class Command(LabelCommand):
"""
Dumps all objects from a Resource to file
"""

def handle_label(self, label, **options):
Resource = apps.get_model(label)
destination = get_dumps_path(Resource)
if not os.path.exists(destination):
os.makedirs(destination)
resource_name = Resource.get_name()
file_path = os.path.join(destination, "{}.dump.json".format(resource_name))
with open(file_path, "w") as dump_file:
queryset_to_disk(Resource.objects, dump_file)
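
Because this is a LabelCommand that resolves its label via apps.get_model, it is invoked with the usual app_label.ModelName form, for example (model name hypothetical): python manage.py dump_resource sources.WikipediaSearch.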