Skip to content

Commit

Permalink
Merge pull request #102 from Purg/dev/use-sha1
Browse files Browse the repository at this point in the history
Change hash usage to sha1 from md5
  • Loading branch information
Purg committed Jul 10, 2015
2 parents fb7360e + 66c652b commit df33b0b
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 74 deletions.
12 changes: 3 additions & 9 deletions etc/system_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
"type": "DataFileSet",
"init": {
"root_directory": "FileDataSets/example_image/data",
"md5_chunk": 8,
// Tells implementation to interpret relative paths to the
// DATA_DIR variable as configured in smqtk_config module.
"data_relative": 1
Expand All @@ -22,7 +21,6 @@
"type": "DataFileSet",
"init": {
"root_directory": "FileDataSets/example_video/data",
"md5_chunk": 8,
"data_relative": 1
}
}
Expand Down Expand Up @@ -88,8 +86,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/csift/example_image",
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0
"kmeans_k": 256
}
},
"CD_CSIFT_Image_example_spatial": {
Expand All @@ -102,7 +99,6 @@
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0,
"use_spatial_pyramid": 1
}
},
Expand All @@ -115,8 +111,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/rgsift/example_image",
"random_seed": 42,
// model generation parameters
"kmeans_k": 256,
"flann_sample_fraction": 1.0
"kmeans_k": 256
}
},
"CD_CSIFT_Video_example": {
Expand All @@ -128,8 +123,7 @@
"work_directory": "ContentDescriptors/ColorDescriptor/csift/example_video",
"random_seed": 42,
// model generation parameters
"kmeans_k": 512,
"flann_sample_fraction": 1.0
"kmeans_k": 512
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def is_usable(cls):
def __init__(self, model_directory, work_directory,
kmeans_k=1024, flann_target_precision=0.95,
flann_sample_fraction=0.75,
flann_autotune=False,
random_seed=None, use_spatial_pyramid=False):
"""
Initialize a new ColorDescriptor interface instance.
Expand Down Expand Up @@ -127,6 +128,10 @@ def __init__(self, model_directory, work_directory,
auto tuning. Default is 0.75 (75%).
:type flann_sample_fraction: float
:param flann_autotune: Have FLANN module use auto-tuning algorithm to
find an optimal index representation and parameter set.
:type flann_autotune: bool
:param use_spatial_pyramid: Use spatial pyramids when quantizing low
level descriptors during feature computation.
:type use_spatial_pyramid: bool
Expand All @@ -146,6 +151,7 @@ def __init__(self, model_directory, work_directory,
self._kmeans_k = int(kmeans_k)
self._flann_target_precision = float(flann_target_precision)
self._flann_sample_fraction = float(flann_sample_fraction)
self._flann_autotune = bool(flann_autotune)
self._use_sp = use_spatial_pyramid
self._rand_seed = None if random_seed is None else int(random_seed)

Expand Down Expand Up @@ -292,8 +298,8 @@ def generate_model(self, data_set, **kwargs):
For colorDescriptor, we generate raw features over the ingest data,
compute a codebook via kmeans, and then create an index with FLANN via
the "autotune" algorithm to intelligently pick the fastest indexing
method.
the "autotune" or linear algorithm to intelligently pick the fastest
indexing method.
:param num_elements: Number of data elements in the iterator
:type num_elements: int
Expand Down Expand Up @@ -371,8 +377,9 @@ def generate_model(self, data_set, **kwargs):
"target_precision": self._flann_target_precision,
"sample_fraction": self._flann_sample_fraction,
"log_level": log_level,
"algorithm": "autotuned"
}
if self._flann_autotune:
p['algorithm'] = "autotuned"
if self._rand_seed is not None:
p['random_seed'] = self._rand_seed
flann_params = flann.build_index(codebook, **p)
Expand Down
41 changes: 27 additions & 14 deletions python/smqtk/data_rep/data_element_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class DataElement (object):

def __init__(self):
    """
    Initialize common base state for a data element.
    """
    # Memoization slots for the md5()/sha1() hex digests; populated lazily
    # on first call so hashing cost is only paid when needed.
    self._md5_cache = None
    self._sha1_cache = None
    # Paths of temporary files produced by write_temp(); emptied again by
    # clean_temp().
    self._temp_filepath_stack = []

@property
Expand All @@ -52,6 +53,15 @@ def md5(self):
self._md5_cache = hashlib.md5(self.get_bytes()).hexdigest()
return self._md5_cache

def sha1(self):
    """
    Get the SHA1 checksum of this element's byte content as a hex digest.

    The digest is computed at most once and memoized on the instance;
    subsequent calls return the cached value.

    :return: SHA1 hex string of the data content.
    :rtype: str
    """
    digest = self._sha1_cache
    if not digest:
        digest = hashlib.sha1(self.get_bytes()).hexdigest()
        self._sha1_cache = digest
    return digest

def write_temp(self, temp_dir=None):
"""
Write this data's bytes to a temporary file on disk, returning the path
Expand Down Expand Up @@ -121,30 +131,33 @@ def clean_temp(self):
os.remove(fp)
self._temp_filepath_stack = []

###
# Abstract methods
#

@abc.abstractmethod
def content_type(self):
    """
    MIME content type of this data element.

    :return: Standard type/subtype string for this data element, or None if
        the content type is unknown.
    :rtype: str or None
    """
    return

@abc.abstractmethod
def uuid(self):
    """
    UUID for this data element.

    This may take different forms, from integers to strings to a uuid.UUID
    instance, but it must be a hashable value.

    By default, this is the hex string of the SHA1 hash of this data's
    bytes. Specific implementations may provide other UUIDs, however.

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

###
# Abstract methods
#

@abc.abstractmethod
def content_type(self):
    """
    MIME content type of this data element.

    :return: Standard type/subtype string for this data element, or None if
        the content type is unknown.
    :rtype: str or None
    """
    return

Expand Down
10 changes: 0 additions & 10 deletions python/smqtk/data_rep/data_element_impl/file_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,6 @@ def content_type(self):
"""
return self._content_type

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the file content, consistent with the rest
    of the package, which keys serialized elements on SHA1 (MD5 retired).

    :return: UUID value for this data element.
    :rtype: str
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
11 changes: 0 additions & 11 deletions python/smqtk/data_rep/data_element_impl/memory_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,6 @@ def content_type(self):
"""
return self._content_type

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the in-memory bytes, consistent with the
    rest of the package, which keys serialized elements on SHA1 (MD5
    retired).

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
12 changes: 0 additions & 12 deletions python/smqtk/data_rep/data_element_impl/url_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def __init__(self, url_address):
super(DataUrlElement, self).__init__()

self._url = url_address
self._md5_cache = None

# make sure that url has a http:// or https:// prefix
if not (self._url[:7] == "http://" or self._url[:8] == "https://"):
Expand All @@ -45,17 +44,6 @@ def content_type(self):
# return MIMETYPES.guess_type(self._url)[0]
return requests.get(self._url).headers['content-type']

def uuid(self):
    """
    UUID for this data element.

    Uses the SHA1 hex digest of the downloaded content, consistent with the
    rest of the package, which keys serialized elements on SHA1 (MD5
    retired).

    :return: UUID value for this data element. This return value should be
        hashable.
    :rtype: collections.Hashable
    """
    return self.sha1()

def get_bytes(self):
"""
:return: Get the byte stream for this data element.
Expand Down
27 changes: 12 additions & 15 deletions python/smqtk/data_rep/data_set_impl/file_set.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
__author__ = 'purg'

import cPickle

import multiprocessing

import os

import re

from smqtk_config import DATA_DIR
Expand All @@ -28,13 +25,13 @@ class DataFileSet (DataSet):
"""

# Filename template for serialized files: element UUID, then SHA1 hex digest.
SERIAL_FILE_TEMPLATE = "UUID_%s.SHA1_%s.dataElement"

# Regex for matching file names as valid FileSet serialized elements
# - yields two groups: the first is the UUID, the second is the SHA1 sum.
# Raw string with escaped dots: a bare "." would match ANY character, so
# malformed names could previously slip through as valid.
SERIAL_FILE_RE = re.compile(r"UUID_(\w+)\.SHA1_(\w+)\.dataElement")

def __init__(self, root_directory, md5_chunk=8, data_relative=False):
def __init__(self, root_directory, sha1_chunk=10, data_relative=False):
"""
Initialize a new or existing file set from a root directory.
Expand All @@ -43,9 +40,9 @@ def __init__(self, root_directory, md5_chunk=8, data_relative=False):
description.
:type root_directory: str
:param md5_chunk: Number of segments to split data element MD5 sum into
when saving element serializations.
:type md5_chunk: int
:param sha1_chunk: Number of segments to split data element SHA1 sum
into when saving element serializations.
:type sha1_chunk: int
:param data_relative: If true, we should interpret ``root_directory`` as
relative to the configured WORK_DIR parameter in the
Expand All @@ -59,7 +56,7 @@ def __init__(self, root_directory, md5_chunk=8, data_relative=False):
os.path.expanduser(root_directory)
)
)
self._md5_chunk = md5_chunk
self._sha1_chunk = sha1_chunk

self._log.debug("Initializing FileSet under root dir: %s",
self._root_dir)
Expand Down Expand Up @@ -135,18 +132,18 @@ def _save_data_elements(self):
# Remove any temporary files an element may have generated
de.clean_temp()

md5 = de.md5()
sha1 = de.sha1()
# Leaving off trailing chunk so that we don't have a single
# directory per md5-sum.
# directory per sha1-sum.
containing_dir = \
os.path.join(self._root_dir,
*partition_string(md5, self._md5_chunk))
*partition_string(sha1, self._sha1_chunk))
if not os.path.isdir(containing_dir):
safe_create_dir(containing_dir)

output_fname = os.path.join(
containing_dir,
self.SERIAL_FILE_TEMPLATE % (str(uuid), md5)
self.SERIAL_FILE_TEMPLATE % (str(uuid), sha1)
)
with open(output_fname, 'wb') as ofile:
cPickle.dump(de, ofile)
Expand Down

0 comments on commit df33b0b

Please sign in to comment.