Skip to content

Commit

Permalink
Merge pull request #102 from Purg/dev/use-sha1
Browse files Browse the repository at this point in the history
Change hash usage to sha1 from md5
  • Loading branch information
Purg committed Jul 10, 2015
2 parents fb7360e + 66c652b commit df33b0b
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 74 deletions.
12 changes: 3 additions & 9 deletions etc/system_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
"type": "DataFileSet",
"init": {
"root_directory": "FileDataSets/example_image/data",
"md5_chunk": 8,
// Tells implementation to interpret relative paths to the
// DATA_DIR variable as configured in smqtk_config module.
"data_relative": 1
Expand All @@ -22,7 +21,6 @@
"type": "DataFileSet",
"init": {
"root_directory": "FileDataSets/example_video/data",
"md5_chunk": 8,
"data_relative": 1
}
}
Expand Down Expand Up @@ -88,8 +86,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/csift/example_image",
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0
"kmeans_k": 256
}
},
"CD_CSIFT_Image_example_spatial": {
Expand All @@ -102,7 +99,6 @@
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0,
"use_spatial_pyramid": 1
}
},
Expand All @@ -115,8 +111,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/rgsift/example_image",
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0
"kmeans_k": 256
}
},
"CD_CSIFT_Video_example": {
Expand All @@ -128,8 +123,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/csift/example_video",
"random_seed": 42,
// model generation parameters
"kmeans_k": 512,
"flann_sample_fraction": 1.0
"kmeans_k": 512
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def is_usable(cls):
def __init__(self, model_directory, work_directory,
kmeans_k=1024, flann_target_precision=0.95,
flann_sample_fraction=0.75,
flann_autotune=False,
random_seed=None, use_spatial_pyramid=False):
"""
Initialize a new ColorDescriptor interface instance.
Expand Down Expand Up @@ -127,6 +128,10 @@ def __init__(self, model_directory, work_directory,
auto tuning. Default is 0.75 (75%).
:type flann_sample_fraction: float
:param flann_autotune: Have FLANN module use auto-tuning algorithm to
find an optimal index representation and parameter set.
:type flann_autotune: bool
:param use_spatial_pyramid: Use spatial pyramids when quantizing low
level descriptors during feature computation.
:type use_spatial_pyramid: bool
Expand All @@ -146,6 +151,7 @@ def __init__(self, model_directory, work_directory,
self._kmeans_k = int(kmeans_k)
self._flann_target_precision = float(flann_target_precision)
self._flann_sample_fraction = float(flann_sample_fraction)
self._flann_autotune = bool(flann_autotune)
self._use_sp = use_spatial_pyramid
self._rand_seed = None if random_seed is None else int(random_seed)

Expand Down Expand Up @@ -292,8 +298,8 @@ def generate_model(self, data_set, **kwargs):
For colorDescriptor, we generate raw features over the ingest data,
compute a codebook via kmeans, and then create an index with FLANN via
the "autotune" algorithm to intelligently pick the fastest indexing
method.
the "autotune" or linear algorithm to intelligently pick the fastest
indexing method.
:param num_elements: Number of data elements in the iterator
:type num_elements: int
Expand Down Expand Up @@ -371,8 +377,9 @@ def generate_model(self, data_set, **kwargs):
"target_precision": self._flann_target_precision,
"sample_fraction": self._flann_sample_fraction,
"log_level": log_level,
"algorithm": "autotuned"
}
if self._flann_autotune:
p['algorithm'] = "autotuned"
if self._rand_seed is not None:
p['random_seed'] = self._rand_seed
flann_params = flann.build_index(codebook, **p)
Expand Down
41 changes: 27 additions & 14 deletions python/smqtk/data_rep/data_element_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class DataElement (object):

def __init__(self):
    """
    Initialize common base state for a data element.
    """
    # Memoization slots for the md5()/sha1() hex digests; populated lazily
    # on first call so hashing cost is only paid when needed.
    self._md5_cache = None
    self._sha1_cache = None
    # Paths of temporary files produced by write_temp(); emptied again by
    # clean_temp().
    self._temp_filepath_stack = []

@property
Expand All @@ -52,6 +53,15 @@ def md5(self):
self._md5_cache = hashlib.md5(self.get_bytes()).hexdigest()
return self._md5_cache

def sha1(self):
    """
    Get the SHA1 checksum of this element's byte content as a hex digest.

    The digest is computed at most once and memoized on the instance;
    subsequent calls return the cached value.

    :return: SHA1 hex string of the data content.
    :rtype: str
    """
    digest = self._sha1_cache
    if not digest:
        digest = hashlib.sha1(self.get_bytes()).hexdigest()
        self._sha1_cache = digest
    return digest

def write_temp(self, temp_dir=None):
"""
Write this data's bytes to a temporary file on disk, returning the path
Expand Down Expand Up @@ -121,30 +131,33 @@ def clean_temp(self):
os.remove(fp)
self._temp_filepath_stack = []

###
# Abstract methods
#

@abc.abstractmethod
def content_type(self):
    """
    MIME content type of this data element.

    :return: Standard type/subtype string for this data element, or None if
        the content type is unknown.
    :rtype: str or None
    """
    return

@abc.abstractmethod
def uuid(self):
    """
    UUID for this data element.

    This may take different forms, from integers to strings to a uuid.UUID
    instance, but it must be a hashable value.

    By default, this is the hex string of the SHA1 hash of this data's
    bytes. Specific implementations may provide other UUIDs, however.

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

###
# Abstract methods
#

@abc.abstractmethod
def content_type(self):
    """
    MIME content type of this data element.

    :return: Standard type/subtype string for this data element, or None if
        the content type is unknown.
    :rtype: str or None
    """
    return

Expand Down
10 changes: 0 additions & 10 deletions python/smqtk/data_rep/data_element_impl/file_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,6 @@ def content_type(self):
"""
return self._content_type

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the file content, consistent with the rest
    of the package, which keys serialized elements on SHA1 (MD5 retired).

    :return: UUID value for this data element.
    :rtype: str
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
11 changes: 0 additions & 11 deletions python/smqtk/data_rep/data_element_impl/memory_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,6 @@ def content_type(self):
"""
return self._content_type

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the in-memory bytes, consistent with the
    rest of the package, which keys serialized elements on SHA1 (MD5
    retired).

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
12 changes: 0 additions & 12 deletions python/smqtk/data_rep/data_element_impl/url_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def __init__(self, url_address):
super(DataUrlElement, self).__init__()

self._url = url_address
self._md5_cache = None

# make sure that url has a http:// or https:// prefix
if not (self._url[:7] == "http://" or self._url[:8] == "https://"):
Expand All @@ -45,17 +44,6 @@ def content_type(self):
# return MIMETYPES.guess_type(self._url)[0]
return requests.get(self._url).headers['content-type']

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the downloaded content, consistent with the
    rest of the package, which keys serialized elements on SHA1 (MD5
    retired).

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
27 changes: 12 additions & 15 deletions python/smqtk/data_rep/data_set_impl/file_set.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
__author__ = 'purg'

import cPickle

import multiprocessing

import os

import re

from smqtk_config import DATA_DIR
Expand All @@ -28,13 +25,13 @@ class DataFileSet (DataSet):
"""

# Filename template for serialized files: element UUID, then SHA1 hex digest.
SERIAL_FILE_TEMPLATE = "UUID_%s.SHA1_%s.dataElement"

# Regex for matching file names as valid FileSet serialized elements
# - yields two groups: the first is the UUID, the second is the SHA1 sum.
# Raw string with escaped dots: a bare "." would match ANY character, so
# malformed names could previously slip through as valid.
SERIAL_FILE_RE = re.compile(r"UUID_(\w+)\.SHA1_(\w+)\.dataElement")

def __init__(self, root_directory, md5_chunk=8, data_relative=False):
def __init__(self, root_directory, sha1_chunk=10, data_relative=False):
"""
Initialize a new or existing file set from a root directory.
Expand All @@ -43,9 +40,9 @@ def __init__(self, root_directory, md5_chunk=8, data_relative=False):
description.
:type root_directory: str
:param md5_chunk: Number of segments to split data element MD5 sum into
when saving element serializations.
:type md5_chunk: int
:param sha1_chunk: Number of segments to split data element SHA1 sum
into when saving element serializations.
:type sha1_chunk: int
:param data_relative: If true, we should interpret ``root_directory`` as
relative to the configured WORK_DIR parameter in the
Expand All @@ -59,7 +56,7 @@ def __init__(self, root_directory, md5_chunk=8, data_relative=False):
os.path.expanduser(root_directory)
)
)
self._md5_chunk = md5_chunk
self._sha1_chunk = sha1_chunk

self._log.debug("Initializing FileSet under root dir: %s",
self._root_dir)
Expand Down Expand Up @@ -135,18 +132,18 @@ def _save_data_elements(self):
# Remove any temporary files an element may have generated
de.clean_temp()

md5 = de.md5()
sha1 = de.sha1()
# Leaving off trailing chunk so that we don't have a single
# directory per md5-sum.
# directory per sha1-sum.
containing_dir = \
os.path.join(self._root_dir,
*partition_string(md5, self._md5_chunk))
*partition_string(sha1, self._sha1_chunk))
if not os.path.isdir(containing_dir):
safe_create_dir(containing_dir)

output_fname = os.path.join(
containing_dir,
self.SERIAL_FILE_TEMPLATE % (str(uuid), md5)
self.SERIAL_FILE_TEMPLATE % (str(uuid), sha1)
)
with open(output_fname, 'wb') as ofile:
cPickle.dump(de, ofile)
Expand Down

0 comments on commit df33b0b

Please sign in to comment.