diff --git a/.travis.yml b/.travis.yml index bb3b552..527260a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,7 @@ install: - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then mkdir -p cl12/CL; fi; fi - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then cd cl12/CL; fi; fi - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl.h; fi; fi - - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl.hpp; fi; fi +# - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl.hpp; fi; fi - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl_d3d10.h; fi; fi - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl_d3d11.h; fi; fi - if [ "$TRAVIS_OS_NAME" == "linux" ]; then if [ "$TEST_SUITE" = "opencl" ]; then wget https://www.khronos.org/registry/cl/api/1.2/cl_dx9_media_sharing.h; fi; fi @@ -49,7 +49,10 @@ before_script: - if [ "$TEST_SUITE" = "reference" ]; then cmake -DCMAKE_BUILD_TYPE=Release ..; fi - if [ "$TEST_SUITE" = "opencl" ]; then cmake -DCMAKE_BUILD_TYPE=Release -DCN24_BUILD_OPENCL:BOOL=ON -DCN24_BUILD_OPENCL_CLBLAS:BOOL=ON ..; fi -script: make +script: + - make +# OpenCL on Mac OS X supports a maximum work group size of (1,1,1) on CPUs, so we only do a sanity check in that case + - if [ "$TEST_SUITE" == "opencl" ] && [ "$TRAVIS_OS_NAME" == "osx" ]; then ./testOpenCL; else ./runBenchmark --ci;fi os: - linux @@ -62,3 +65,8 @@ compiler: env: - TEST_SUITE=opencl - TEST_SUITE=reference + +matrix: + exclude: + - os: linux + env: TEST_SUITE=opencl diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2bb95..23c8aa9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,9 @@ cmake_minimum_required(VERSION 2.8) project(CN24 C CXX) -set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Build type: Select either Debug, RelWithDebInfo or Release" FORCE) +endif() set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") @@ -197,6 +199,7 @@ if(CN24_BUILD_ACCELERATE) message(STATUS "Using Accelerate include directory: ${ACCELERATE_INCLUDE_DIR}") include_directories(${ACCELERATE_INCLUDE_DIR}) set(CN24_LIBS ${CN24_LIBS} ${ACCELERATE_BLAS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") add_definitions("-DBUILD_BLAS") add_definitions("-DBLAS_ACCELERATE") endif() @@ -204,9 +207,9 @@ endif() set(CN24_BUILD_OPENCL OFF CACHE BOOL "Build CN24 with OpenCL support") if(CN24_BUILD_OPENCL) find_library(OPENCL_ICDL NAMES OpenCL libOpenCL PATHS $ENV{CUDA_PATH}/lib/x64 - $ENV{INTELOCLSDKROOT}/lib/x64) + $ENV{INTELOCLSDKROOT}/lib/x64 $ENV{AMDAPPSDKROOT}/lib/x86_64) find_path(OPENCL_INCLUDE_DIR CL/cl.h cl.h PATHS $ENV{CUDA_PATH}/include - $ENV{INTELOCLSDKROOT}/include) + $ENV{INTELOCLSDKROOT}/include $ENV{AMDAPPSDKROOT}/include) message(STATUS "Using OpenCL library: ${OPENCL_ICDL}") message(STATUS "Using OpenCL include directory: ${OPENCL_INCLUDE_DIR}") include_directories(${OPENCL_INCLUDE_DIR}) @@ -219,8 +222,8 @@ endif() set(CN24_BUILD_OPENCL_CLBLAS OFF CACHE BOOL "Build CN24 with OpenCL/clBLAS") if(CN24_BUILD_OPENCL_CLBLAS) if(CN24_BUILD_OPENCL) - 
find_library(CLBLAS_LIBRARY clBLAS libclBLAS) - find_path(CLBLAS_INCLUDE_DIR clBLAS.h) + find_library(CLBLAS_LIBRARY clBLAS libclBLAS PATHS $ENV{AMDAPPSDKROOT}/lib64/import) + find_path(CLBLAS_INCLUDE_DIR clBLAS.h PATHS $ENV{AMDAPPSDKROOT}/include) message(STATUS "Using OpenCL/clBLAS library: ${CLBLAS_LIBRARY}") message(STATUS "Using OpenCL/clBLAS include directory: ${CLBLAS_INCLUDE_DIR}") include_directories(${CLBLAS_INCLUDE_DIR}) diff --git a/include/cn24.h b/include/cn24.h index 65403dc..a4ed5f1 100644 --- a/include/cn24.h +++ b/include/cn24.h @@ -17,14 +17,22 @@ #include "cn24/util/Config.h" #include "cn24/util/Dataset.h" #include "cn24/util/Tensor.h" +#include "cn24/util/CompressedTensor.h" #include "cn24/util/TensorViewer.h" #include "cn24/util/CombinedTensor.h" +#include "cn24/util/TensorStream.h" +#include "cn24/util/CompressedTensorStream.h" +#include "cn24/util/FloatTensorStream.h" #include "cn24/util/PNGUtil.h" #include "cn24/util/JPGUtil.h" #include "cn24/util/Log.h" #include "cn24/util/KITTIData.h" #include "cn24/util/Init.h" #include "cn24/util/GradientTester.h" +#include "cn24/util/StatAggregator.h" +#include "cn24/util/StatSink.h" +#include "cn24/util/ConsoleStatSink.h" +#include "cn24/util/CSVStatSink.h" #include "cn24/math/TensorMath.h" @@ -38,6 +46,7 @@ #include "cn24/net/ConvolutionLayer.h" #include "cn24/net/MaxPoolingLayer.h" #include "cn24/net/AdvancedMaxPoolingLayer.h" +#include "cn24/net/InputDownSamplingLayer.h" #include "cn24/net/LocalResponseNormalizationLayer.h" #include "cn24/net/UpscaleLayer.h" #include "cn24/net/LossFunctionLayer.h" @@ -48,11 +57,13 @@ #include "cn24/net/SpatialPriorLayer.h" #include "cn24/net/ConcatenationLayer.h" #include "cn24/net/GradientAccumulationLayer.h" +#include "cn24/net/SumLayer.h" #include "cn24/net/Net.h" #include "cn24/net/Trainer.h" #include "cn24/net/NetGraph.h" #include "cn24/net/NetStatus.h" #include "cn24/factory/ConfigurableFactory.h" +#include "cn24/factory/SkipLayerNetworkFactory.h" #endif diff --git a/include/cn24/factory/ConfigurableFactory.h b/include/cn24/factory/ConfigurableFactory.h index aed2b25..2f172ca 100644 --- a/include/cn24/factory/ConfigurableFactory.h +++ b/include/cn24/factory/ConfigurableFactory.h @@ -24,8 +24,20 @@ #include "../util/Log.h" namespace Conv { + +class Factory { +public: + virtual int AddLayers(Net& net, Connection data_layer_connection, const unsigned int output_classes, bool add_loss_layer = false, std::ostream& graph_output = std::cout) = 0; + virtual bool AddLayers(NetGraph& graph, NetGraphConnection data_layer_connection, const unsigned int output_classes, bool add_loss_layer = false) = 0; + virtual int patchsizex() = 0; + virtual int patchsizey() = 0; + virtual Layer* CreateLossLayer(const unsigned int output_classes, const datum loss_weight = 1.0) = 0; + virtual void InitOptimalSettings() = 0; + virtual TrainerSettings optimal_settings() const = 0; + virtual Method method() const = 0; +}; -class ConfigurableFactory { +class ConfigurableFactory : public Factory { public: /** * @brief Builds a ConfigurableFactory using an input stream and a random seed diff --git a/include/cn24/factory/SkipLayerNetworkFactory.h b/include/cn24/factory/SkipLayerNetworkFactory.h new file mode 100644 index 0000000..8b97f8b --- /dev/null +++ b/include/cn24/factory/SkipLayerNetworkFactory.h @@ -0,0 +1,28 @@ +#ifndef CONV_SKIPLAYERNETWORKFACTORY_H +#define CONV_SKIPLAYERNETWORKFACTORY_H + +#include + +#include "../net/Net.h" +#include "../net/NetGraph.h" +#include "../net/Trainer.h" +#include 
"../util/Dataset.h" +#include "../util/Log.h" +#include "ConfigurableFactory.h" + +namespace Conv { + +class SkipLayerNetworkFactory : public Factory { + int AddLayers(Net& net, Connection data_layer_connection, const unsigned int output_classes, bool add_loss_layer = false, std::ostream& graph_output = std::cout); + bool AddLayers(NetGraph& graph, NetGraphConnection data_layer_connection, const unsigned int output_classes, bool add_loss_layer = false); + int patchsizex(); + int patchsizey(); + Layer* CreateLossLayer(const unsigned int output_classes, const datum loss_weight = 1.0); + void InitOptimalSettings(); + TrainerSettings optimal_settings() const; + Method method() const; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/math/TensorMath.h b/include/cn24/math/TensorMath.h index d54283b..4a77d58 100644 --- a/include/cn24/math/TensorMath.h +++ b/include/cn24/math/TensorMath.h @@ -100,6 +100,25 @@ class TensorMath { static void SMS2( const Tensor& source, Tensor& target); + + static void DOWN( + const Tensor& source, + Tensor& target, + const int region_width, + const int region_height, + const datum target_factor); + + static void UP( + const Tensor& source, + Tensor& target, + const int region_width, + const int region_height, + const datum target_factor); + + static void ADD( + const Tensor& source_a, + const Tensor& source_b, + Tensor& target); }; } diff --git a/include/cn24/net/BinaryStatLayer.h b/include/cn24/net/BinaryStatLayer.h index 7d60373..1343acc 100644 --- a/include/cn24/net/BinaryStatLayer.h +++ b/include/cn24/net/BinaryStatLayer.h @@ -19,6 +19,7 @@ #include "Layer.h" #include "StatLayer.h" +#include "../util/StatAggregator.h" namespace Conv { @@ -33,6 +34,8 @@ class BinaryStatLayer: public Layer, public StatLayer { */ BinaryStatLayer(unsigned int thresholds = 24, const datum min_t = -0.458333, const datum max_t = 0.5); + + void UpdateAll(); /** * @brief Prints the current statistics @@ -80,6 +83,13 @@ class BinaryStatLayer: public Layer, public StatLayer { datum* false_negatives_ = nullptr; bool disabled_ = false; + + StatDescriptor* stat_fpr_ = nullptr; + StatDescriptor* stat_fnr_ = nullptr; + StatDescriptor* stat_pre_ = nullptr; + StatDescriptor* stat_rec_ = nullptr; + StatDescriptor* stat_acc_ = nullptr; + StatDescriptor* stat_f1_ = nullptr; }; } diff --git a/include/cn24/net/ConfusionMatrixLayer.h b/include/cn24/net/ConfusionMatrixLayer.h index f5ca02b..143ffb9 100644 --- a/include/cn24/net/ConfusionMatrixLayer.h +++ b/include/cn24/net/ConfusionMatrixLayer.h @@ -19,7 +19,9 @@ #include #include "Layer.h" -#include "StatLayer.h" +#include "StatLayer.h" + +#include "../util/StatAggregator.h" namespace Conv { @@ -34,10 +36,11 @@ class ConfusionMatrixLayer: public Layer, public StatLayer { explicit ConfusionMatrixLayer(std::vector names, const unsigned int classes); + void UpdateAll(); /** * @brief Prints the current statistics * - * @param prefix This is printed before every line ouf output + * @param prefix This is printed before every line of output * @param training Whether the net is currently training. 
Affects output color */ void Print (std::string prefix, bool training); @@ -80,6 +83,11 @@ class ConfusionMatrixLayer: public Layer, public StatLayer { long double total_ = 0; long double right_ = 0; long double* per_class_ = nullptr; + + StatDescriptor* stat_orr_ = nullptr; + StatDescriptor* stat_arr_ = nullptr; + StatDescriptor* stat_iou_ = nullptr; + }; } diff --git a/include/cn24/net/InputDownSamplingLayer.h b/include/cn24/net/InputDownSamplingLayer.h new file mode 100644 index 0000000..1a81b61 --- /dev/null +++ b/include/cn24/net/InputDownSamplingLayer.h @@ -0,0 +1,70 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +/** + * @file InputDownSamplingLayer.h + * @class InputDownSamplingLayer + * @brief Layer that scales input down + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_INPUTDOWNSAMPLINGLAYER_H +#define CONV_INPUTDOWNSAMPLINGLAYER_H + +#include <sstream> +#include <string> + +#include "SimpleLayer.h" + + +namespace Conv { + +class InputDownSamplingLayer : public SimpleLayer { +public: + /** + * @brief Constructs an InputDownSamplingLayer. + * + * @param region_width Width of the pooling regions + * @param region_height Height of the pooling regions + */ + InputDownSamplingLayer(const unsigned int region_width, + const unsigned int region_height); + + // Implementations for SimpleLayer + bool CreateOutputs (const std::vector< CombinedTensor* >& inputs, std::vector< CombinedTensor* >& outputs); + bool Connect (const CombinedTensor* input, CombinedTensor* output); + void FeedForward(); + void BackPropagate(); + + inline unsigned int Gain() { + return gain / (region_width_ * region_height_); + } + + inline std::string GetLayerDescription() { + std::ostringstream ss; + ss << "Input Down-Sampling Layer (" << region_width_ << "x" << region_height_ << ")"; + return ss.str(); + } + + bool IsOpenCLAware(); +private: + // Settings + unsigned int region_width_ = 0; + unsigned int region_height_ = 0; + + // Feature map dimensions + unsigned int input_width_ = 0; + unsigned int input_height_ = 0; + unsigned int output_width_ = 0; + unsigned int output_height_ = 0; + + unsigned int maps_ = 0; +}; + +} + +#endif diff --git a/include/cn24/net/NetGraph.h b/include/cn24/net/NetGraph.h index 2854d38..4ea855e 100644 --- a/include/cn24/net/NetGraph.h +++ b/include/cn24/net/NetGraph.h @@ -20,6 +20,8 @@ #include "NetStatus.h" #include "../util/TensorViewer.h" +#include "StatLayer.h" + #include namespace Conv { @@ -106,6 +108,12 @@ class NetGraph : public NetStatus { // Output void PrintGraph(std::ostream& graph_output); void SetLayerViewEnabled(bool enabled) { layerview_enabled_ = enabled; } + void SetStatLayersEnabled(bool enabled) { + for (unsigned int n = 0; n < GetStatNodes().size(); n++) { + StatLayer* stat_layer = dynamic_cast<StatLayer*>(GetStatNodes()[n]->layer); + stat_layer->SetDisabled(!enabled); + } + } datum AggregateLoss(); // Status diff --git a/include/cn24/net/NetStatus.h b/include/cn24/net/NetStatus.h index e77edfa..7979921 100644 --- a/include/cn24/net/NetStatus.h +++ b/include/cn24/net/NetStatus.h @@ -15,6 +15,9 @@ #ifndef CONV_NETSTATUS_H #define CONV_NETSTATUS_H +#include "../util/Init.h" +#include "../util/StatAggregator.h" + namespace Conv { class NetStatus{ @@ -29,7 +32,10 @@ class NetStatus{ * * @param is_testing The new testing status */ - inline void SetIsTesting(bool
is_testing) { is_testing_ = is_testing; } + inline void SetIsTesting(bool is_testing) { + is_testing_ = is_testing; + System::stat_aggregator->hardcoded_stats_.is_training = !is_testing; + } private: bool is_testing_ = false; }; diff --git a/include/cn24/net/StatLayer.h b/include/cn24/net/StatLayer.h index 783b2df..23a89f1 100644 --- a/include/cn24/net/StatLayer.h +++ b/include/cn24/net/StatLayer.h @@ -23,6 +23,7 @@ namespace Conv { class StatLayer { public: + virtual void UpdateAll() = 0; virtual void Print(std::string prefix, bool training) = 0; virtual void Reset() = 0; virtual void SetDisabled(bool disabled) = 0; diff --git a/include/cn24/net/SumLayer.h b/include/cn24/net/SumLayer.h new file mode 100644 index 0000000..6ab4f13 --- /dev/null +++ b/include/cn24/net/SumLayer.h @@ -0,0 +1,47 @@ +/** + * @file SumLayer.h + * @class SumLayer + * @brief Adds the two inputs element-wise. + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_SUMLAYER_H +#define CONV_SUMLAYER_H + +#include <vector> + +#include "Layer.h" + +namespace Conv { + +class SumLayer: public Layer { +public: + SumLayer(); + + // Layer implementations + bool CreateOutputs (const std::vector< CombinedTensor* >& inputs, + std::vector< CombinedTensor* >& outputs); + bool Connect (const std::vector< CombinedTensor* >& inputs, + const std::vector< CombinedTensor* >& outputs, + const NetStatus* status ); + void FeedForward(); + void BackPropagate(); + + std::string GetLayerDescription() { return "Sum Layer"; } + void CreateBufferDescriptors(std::vector< NetGraphBuffer >& buffers) { + NetGraphBuffer buffer; + buffer.description = "Output"; + buffers.push_back(buffer); + }; +private: + CombinedTensor* input_a_ = nullptr; + CombinedTensor* input_b_ = nullptr; + CombinedTensor* output_ = nullptr; + + unsigned int maps_ = 0; + unsigned int samples_ = 0; +}; + +} +#endif diff --git a/include/cn24/net/Trainer.h b/include/cn24/net/Trainer.h index 99ec184..c54ce0a 100644 --- a/include/cn24/net/Trainer.h +++ b/include/cn24/net/Trainer.h @@ -18,6 +18,7 @@ #include #include "../util/CombinedTensor.h" +#include "../util/StatAggregator.h" #include "TrainingLayer.h" #include "NetGraph.h" @@ -41,6 +42,7 @@ struct TrainerSettings { datum mu = 1.75; datum eta = 1.5; OPTIMIZATION_METHOD optimization_method = GRADIENT_DESCENT; + bool stats_during_training = true; unsigned int pbatchsize = 1; unsigned int sbatchsize = 1; unsigned int iterations = 500; @@ -61,7 +63,7 @@ class Trainer { * * @param epochs The number of epochs to train */ - void Train (unsigned int epochs); + void Train (unsigned int epochs, bool do_snapshot); /** * @brief Test the net by running every test sample through the net @@ -101,9 +103,13 @@ class Trainer { * (datum) iteration, -settings_.exponent); } + + inline void SetStatsDuringTraining(bool enable) { settings_.stats_during_training = enable; } private: void ApplyGradients (datum lr); + void InitializeStats(); + // References for easy access NetGraph& graph_; std::vector parameters_; @@ -116,12 +122,23 @@ class Trainer { // Sample count unsigned int sample_count_ = 0; + unsigned int weight_count_ = 0; // Learning options TrainerSettings settings_; // State unsigned int epoch_ = 0; + + // Global state + static bool stats_are_initialized_; + static StatDescriptor* stat_aggloss_; + static StatDescriptor* stat_qp_caseA_; + static StatDescriptor* stat_qp_caseB_; + static StatDescriptor* stat_qp_caseC_; + static StatDescriptor* stat_qp_caseM_; + static StatDescriptor*
stat_fps_; + static StatDescriptor* stat_sps_; }; diff --git a/include/cn24/net/UpscaleLayer.h b/include/cn24/net/UpscaleLayer.h index 6f834c7..a8d6a55 100644 --- a/include/cn24/net/UpscaleLayer.h +++ b/include/cn24/net/UpscaleLayer.h @@ -41,6 +41,8 @@ class UpscaleLayer : public SimpleLayer { void FeedForward(); void BackPropagate(); + bool IsOpenCLAware() { return true; } + inline std::string GetLayerDescription() { std::ostringstream ss; ss << "Upscale Layer (" << region_width_ << "x" << region_height_ << ")"; diff --git a/include/cn24/util/CSVStatSink.h b/include/cn24/util/CSVStatSink.h new file mode 100644 index 0000000..f377e0c --- /dev/null +++ b/include/cn24/util/CSVStatSink.h @@ -0,0 +1,44 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +/** + * @file CSVStatSink.h + * @brief Gets data from StatAggregator and processes it into a CSV file + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_CSVSTATSINK_H +#define CONV_CSVSTATSINK_H + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +#include "Config.h" +#include "Log.h" + +#include "StatAggregator.h" +#include "StatSink.h" + +namespace Conv +{ +class CSVStatSink : public StatSink { +public: + ~CSVStatSink() { if(csv_stream_ != nullptr) {csv_stream_->close(); delete csv_stream_; }} + virtual void Initialize(std::vector<StatDescriptor*>& stat_descriptors); + virtual void Process(HardcodedStats& hardcoded_stats, std::vector<Stat*>& stats); + virtual void SetCurrentExperiment(std::string current_experiment); + +private: + std::vector<StatDescriptor*> stat_descriptors_; + std::ofstream* csv_stream_ = nullptr; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/CompressedTensor.h b/include/cn24/util/CompressedTensor.h new file mode 100644 index 0000000..45ff78b --- /dev/null +++ b/include/cn24/util/CompressedTensor.h @@ -0,0 +1,130 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#ifndef CONV_COMPRESSEDTENSOR_H +#define CONV_COMPRESSEDTENSOR_H + +#include +#include +#include + +#include "Log.h" +#include "Config.h" + +#include "Tensor.h" + +namespace Conv { + +class CompressedTensor; +/** + * @brief Prints size to the ostream, may be helpful. + */ +std::ostream& operator<< (std::ostream& output, const CompressedTensor& tensor); + +class CompressedTensor { +public: + /** + * @brief Constructs an empty CompressedTensor of zero size. + */ + CompressedTensor (); + + ~CompressedTensor (); + + /* + * Compression and decompression encapsulated + */ + void Compress(Tensor& tensor); + void Decompress(Tensor& tensor, datum* preallocated_memory = nullptr); + + + /** + * @brief Serializes the CompressedTensor to the stream. + * + * @param output The output stream + */ + void Serialize (std::ostream& output); + + /** + * @brief Deserializes from the stream. + * + * Note that this resizes the CompressedTensor if necessary and overwrites its content.
+ * @param input The input stream + * @param head_only Set to true to only read the dimensions + * @param try_mmap Set to true to attempt to memory map the file + * @param fd File descriptor for the SAME file as input's underlying + */ + void Deserialize (std::istream& input, bool head_only = false, bool try_mmap = false, int fd = 0); + + /** + * @brief Writes some tensor statistics to the debug output + */ + void PrintStats(); + + /** + * @brief Deallocates the memory if data_ptr is not a nullptr. + */ + void DeleteIfPossible(); + + // Accessors for the size information + inline std::size_t samples() const { + return samples_; + } + inline std::size_t maps() const { + return maps_; + } + inline std::size_t height() const { + return height_; + } + inline std::size_t width() const { + return width_; + } + inline std::size_t elements() const { + return elements_; + } + inline std::size_t compressed_length() const { + return compressed_length_; + } + +private: + /** + * @brief Resizes the CompressedTensor with data loss. + */ + void Resize (const std::size_t samples, const std::size_t width, + const std::size_t height, const std::size_t maps, + const std::size_t compressed_length, + char* const preallocated_memory = nullptr, bool mmapped = false ); + + // Pointer to the actual data + char* compressed_data_ptr_ = nullptr; + + // Sizes + std::size_t samples_ = 0; + std::size_t maps_ = 0; + std::size_t height_ = 0; + std::size_t width_ = 0; + std::size_t elements_ = 0; + + std::size_t compressed_length_ = 0; + + static void CompressData(void* uncompressed, const std::size_t& uncompressed_elements, void* compressed, std::size_t& compressed_length); + static void DecompressData(void* uncompressed, std::size_t& uncompressed_elements, void* compressed, const std::size_t& compressed_length); + +public: + + /** + * @brief If this is true, the CompressedTensor was memory mapped + */ + bool mmapped_ = false; + void* original_mmap_ = nullptr; +}; + + + +} + +#endif diff --git a/include/cn24/util/CompressedTensorStream.h b/include/cn24/util/CompressedTensorStream.h new file mode 100644 index 0000000..a8f67a4 --- /dev/null +++ b/include/cn24/util/CompressedTensorStream.h @@ -0,0 +1,52 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#ifndef CONV_COMPRESSEDTENSORSTREAM_H +#define CONV_COMPRESSEDTENSORSTREAM_H + +#include +#include +#include + +#include "Log.h" +#include "Config.h" + +#include "Tensor.h" +#include "CompressedTensor.h" + +#include "TensorStream.h" + +#define CN24_CTS_MAGIC 0xC24CC24CC24CC24C + +namespace Conv { + +class CompressedTensorStream : public TensorStream { +public: + + ~CompressedTensorStream() { + for(CompressedTensor* tensor: tensors_) { + delete tensor; + } + } + + // TensorStream implementations + std::size_t GetWidth(unsigned int index) { return index < tensors_.size() ? tensors_[index]->width() : 0; } + std::size_t GetHeight(unsigned int index) { return index < tensors_.size() ? tensors_[index]->height() : 0; } + std::size_t GetMaps(unsigned int index) { return index < tensors_.size() ? tensors_[index]->maps() : 0; } + std::size_t GetSamples(unsigned int index) { return index < tensors_.size() ? 
tensors_[index]->samples() : 0; } + unsigned int GetTensorCount() { return tensors_.size(); } + unsigned int LoadFile(std::string path); + bool CopySample(const unsigned int source_index, const std::size_t source_sample, Tensor& target, const std::size_t target_sample); +private: + std::vector<CompressedTensor*> tensors_; + std::size_t max_elements_ = 0; + Tensor temp_tensor_; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/ConsoleStatSink.h b/include/cn24/util/ConsoleStatSink.h new file mode 100644 index 0000000..a315d4d --- /dev/null +++ b/include/cn24/util/ConsoleStatSink.h @@ -0,0 +1,57 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +/** + * @file ConsoleStatSink.h + * @brief Gets data from StatAggregator and processes it + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_CONSOLESTATSINK_H +#define CONV_CONSOLESTATSINK_H + +#include <iomanip> +#include <string> +#include <vector> + +#include "Config.h" +#include "Log.h" + +#include "StatAggregator.h" +#include "StatSink.h" + +namespace Conv +{ +class ConsoleStatSink : public StatSink { +public: + virtual void Initialize(std::vector<StatDescriptor*>& stat_descriptors) { + stat_descriptors_ = stat_descriptors; + LOGDEBUG << "Initializing ConsoleStatSink. Registered Stats:"; + for(unsigned int s = 0; s < stat_descriptors_.size(); s++) { + LOGDEBUG << " - " << stat_descriptors_[s]->description; + } + } + virtual void Process(HardcodedStats& hardcoded_stats, std::vector<Stat*>& stats) { + (hardcoded_stats.is_training ? LOGTRESULT : LOGRESULT) << "Stats for epoch " << hardcoded_stats.epoch << ":" << LOGRESULTEND; + for(unsigned int s = 0; s < stat_descriptors_.size(); s++) { + if(!stats[s]->is_null) { + (hardcoded_stats.is_training ?
LOGTRESULT : LOGRESULT) << std::setw(32) << stat_descriptors_[s]->description << ": " << std::setw(24) << stats[s]->value << " " << stat_descriptors_[s]->unit << LOGRESULTEND; + } + } + } + virtual void SetCurrentExperiment(std::string current_experiment) { + LOGINFO << "Beginning Experiment: " << current_experiment; + } + +private: + std::vector<StatDescriptor*> stat_descriptors_; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/Dataset.h b/include/cn24/util/Dataset.h index 539c974..22754f0 100644 --- a/include/cn24/util/Dataset.h +++ b/include/cn24/util/Dataset.h @@ -18,6 +18,7 @@ #include "Config.h" #include "Tensor.h" +#include "TensorStream.h" namespace Conv { @@ -214,8 +215,10 @@ class TensorStreamPatchDataset : public Dataset { class TensorStreamDataset : public Dataset { public: - TensorStreamDataset(std::istream& training_stream, - std::istream& testing_stream, + TensorStreamDataset(/*std::istream& training_stream, + std::istream& testing_stream,*/ + TensorStream* training_stream, + TensorStream* testing_stream, unsigned int classes, std::vector<std::string> class_names, std::vector<unsigned int> class_colors, @@ -244,8 +247,12 @@ class TensorStreamDataset : public Dataset { private: // Stored data + /* Tensor* data_ = nullptr; Tensor* labels_ = nullptr; + */ + TensorStream* training_stream_; + TensorStream* testing_stream_; Tensor error_cache; diff --git a/include/cn24/util/FloatTensorStream.h b/include/cn24/util/FloatTensorStream.h new file mode 100644 index 0000000..b66ced2 --- /dev/null +++ b/include/cn24/util/FloatTensorStream.h @@ -0,0 +1,46 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#ifndef CONV_FLOATTENSORSTREAM_H +#define CONV_FLOATTENSORSTREAM_H + +#include +#include +#include + +#include "Log.h" +#include "Config.h" + +#include "Tensor.h" +#include "TensorStream.h" + +namespace Conv { + +class FloatTensorStream : public TensorStream { +public: + + ~FloatTensorStream() { + for(Tensor* tensor: tensors_) { + delete tensor; + } + } + + // TensorStream implementations + std::size_t GetWidth(unsigned int index) { return index < tensors_.size() ? tensors_[index]->width() : 0; } + std::size_t GetHeight(unsigned int index) { return index < tensors_.size() ? tensors_[index]->height() : 0; } + std::size_t GetMaps(unsigned int index) { return index < tensors_.size() ? tensors_[index]->maps() : 0; } + std::size_t GetSamples(unsigned int index) { return index < tensors_.size() ? tensors_[index]->samples() : 0; } + unsigned int GetTensorCount() { return tensors_.size(); } + unsigned int LoadFile(std::string path); + bool CopySample(const unsigned int source_index, const std::size_t source_sample, Tensor& target, const std::size_t target_sample); +private: + std::vector<Tensor*> tensors_; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/GradientTester.h b/include/cn24/util/GradientTester.h index 5bd20b3..aed9529 100644 --- a/include/cn24/util/GradientTester.h +++ b/include/cn24/util/GradientTester.h @@ -22,9 +22,9 @@ class GradientTester { /** * @brief Tests the gradients computed by the net numerically * - * Only call this function on nets with a constant input! + * Only call this function on nets with a constant input, not a DatasetInputLayer!
*/ - static void TestGradient(NetGraph& net); + static void TestGradient(NetGraph& net, unsigned int skip_weights = 0, bool fatal_fail = false); }; diff --git a/include/cn24/util/Init.h b/include/cn24/util/Init.h index 45a4b19..3bd8b26 100644 --- a/include/cn24/util/Init.h +++ b/include/cn24/util/Init.h @@ -6,7 +6,8 @@ */ /** * @file Init.h - * @brief Provides initialization functions for several subsystems + * @brief Provides initialization functions for several subsystems and a + * singleton "System" class for global objects. * * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) */ @@ -18,11 +19,13 @@ namespace Conv { class TensorViewer; +class StatAggregator; class System { public: static void Init(int requested_log_level = -1); static void GetExecutablePath(std::string& binary_path); static TensorViewer* viewer; + static StatAggregator* stat_aggregator; static int log_level; }; } diff --git a/include/cn24/util/StatAggregator.h b/include/cn24/util/StatAggregator.h new file mode 100644 index 0000000..765dab9 --- /dev/null +++ b/include/cn24/util/StatAggregator.h @@ -0,0 +1,107 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +/** + * @file StatAggregator.h + * @brief Collects data from various sources and aggregates them into a statistic + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_STATAGGREGATOR_H +#define CONV_STATAGGREGATOR_H + +#include <chrono> +#include <climits> +#include <functional> +#include <string> +#include <vector> + +#include "Config.h" + +namespace Conv +{ +// Forward declarations +class StatSink; +class Trainer; +class NetStatus; + +// Hardcoded stats +struct HardcodedStats { + double seconds_elapsed = 0.0; + unsigned long iterations = 0UL; + unsigned long weights = 0UL; + unsigned long epoch = 0UL; + bool is_training = false; + std::string current_experiment = "unnamed"; + + void Reset() { + seconds_elapsed = 0.0; + iterations = 0UL; + weights = 0UL; + } +}; + +struct Stat { + double value = 0.0; + bool is_null = false; +}; + +struct StatDescriptor { + bool nullable = false; + std::string description = ""; + std::string unit = ""; + + // Lambdas for processing + std::function<void(Stat&)> init_function = [] (Stat& stat) {}; + std::function<void(Stat&, double)> update_function = [] (Stat& stat, double user_value) {}; + std::function<Stat(HardcodedStats&, Stat&)> output_function = + [] (HardcodedStats& hc_stats, Stat& stat) -> Stat {return stat;}; + + // For easy access + unsigned int stat_id = UINT_MAX; +}; + +class StatAggregator { + friend class Trainer; + friend class NetStatus; +public: + unsigned int RegisterStat(StatDescriptor* stat_descriptor); + unsigned int RegisterSink(StatSink* stat_sink); + void Initialize(); + + void Update(unsigned int stat_id, double user_value); + void Generate(); + + void StartRecording(); + void StopRecording(); + void Reset(); + + void Snapshot(); + + void SetCurrentExperiment(std::string current_experiment); +private: + // State + enum StatAggregatorState { + STOPPED, RECORDING, INIT } state_ = INIT; + std::chrono::time_point<std::chrono::system_clock> start_time_; + + // Stats + HardcodedStats hardcoded_stats_; + std::vector<Stat*> stats_; + + // Descriptors + std::vector<StatDescriptor*> stat_descriptors_; + unsigned int stat_descriptor_count_ = 0; + + // Sinks + std::vector<StatSink*> stat_sinks_; + unsigned int stat_sink_count_ = 0; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/StatSink.h b/include/cn24/util/StatSink.h
new file mode 100644 index 0000000..80c5031 --- /dev/null +++ b/include/cn24/util/StatSink.h @@ -0,0 +1,36 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +/** + * @file StatSink.h + * @brief Gets data from StatAggregator and processes it + * + * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com) + */ + +#ifndef CONV_STATSINK_H +#define CONV_STATSINK_H + +#include <string> +#include <vector> + +#include "Config.h" + +#include "StatAggregator.h" + +namespace Conv +{ +class StatSink { +public: + virtual void Initialize(std::vector<StatDescriptor*>& stat_descriptors) = 0; + virtual void SetCurrentExperiment(std::string current_experiment) = 0; + virtual void Process(HardcodedStats& hardcoded_stats, std::vector<Stat*>& stats) = 0; +}; + +} + +#endif \ No newline at end of file diff --git a/include/cn24/util/Tensor.h b/include/cn24/util/Tensor.h index 189098c..7e1797b 100644 --- a/include/cn24/util/Tensor.h +++ b/include/cn24/util/Tensor.h @@ -104,7 +104,8 @@ class Tensor { */ void Resize (const std::size_t samples, const std::size_t width = 1, const std::size_t height = 1, const std::size_t maps = 1, - datum* const preallocated_memory = nullptr, bool mmapped = false ); + datum* const preallocated_memory = nullptr, bool mmapped = false, + bool dont_delete = false); /** * @brief Resizes the Tensor to match another Tensor's size. diff --git a/include/cn24/util/TensorStream.h b/include/cn24/util/TensorStream.h new file mode 100644 index 0000000..4b1d69a --- /dev/null +++ b/include/cn24/util/TensorStream.h @@ -0,0 +1,62 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#ifndef CONV_TENSORSTREAM_H +#define CONV_TENSORSTREAM_H + +#include +#include +#include +#include + +#include "Log.h" +#include "Config.h" + +#include "Tensor.h" + +namespace Conv { + +class TensorStream { +public: + virtual std::size_t GetWidth(unsigned int index) = 0; + virtual std::size_t GetHeight(unsigned int index) = 0; + virtual std::size_t GetMaps(unsigned int index) = 0; + virtual std::size_t GetSamples(unsigned int index) = 0; + virtual unsigned int LoadFile(std::string path) = 0; + + virtual bool CopySample(const unsigned int source, const std::size_t source_sample, + Tensor& target, const std::size_t target_sample) = 0; + + virtual unsigned int GetTensorCount() = 0; + + static TensorStream* FromFile(std::string path); +}; + +} + +#endif diff --git a/include/private/CLHelper.h b/include/private/CLHelper.h index 0d4619f..ae4eb4d 100644 --- a/include/private/CLHelper.h +++ b/include/private/CLHelper.h @@ -64,6 +64,8 @@ class CLHelper { static cl_kernel k_sms; static cl_kernel k_im2col; static cl_kernel k_col2im; + static cl_kernel k_up; + static cl_kernel k_down; #endif }; diff --git a/kernels/scaling.cl b/kernels/scaling.cl new file mode 100644 index 0000000..f81c1e8 --- /dev/null +++ b/kernels/scaling.cl @@ -0,0 +1,69 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project.
+ */ +__kernel void DOWN ( __global float* X, + __global float* Y, + uint target_width, + uint target_height, + uint source_width, + uint source_height, + uint region_width, + uint region_height, + float target_factor) +{ + uint target_x = get_global_id(0); + uint target_y = get_global_id(1); + uint target_skid = get_global_id(2); + + uint source_x = target_x * region_width; + uint source_y = target_y * region_height; + + uint X_sk = source_width * source_height * target_skid; + + float sum = 0.0; + for(uint ry = 0; ry < region_height; ry++) { + const uint X_line = X_sk + (source_width * (source_y + ry)); + for(uint rx = 0; rx < region_width; rx++) { + const uint X_idx = X_line + source_x + rx; + const float X_val = X[X_idx]; + sum += X_val; + } + } + + uint Y_sk = target_width * target_height * target_skid; + uint Y_line = Y_sk + (target_width * target_y); + uint Y_idx = Y_line + target_x; + Y[Y_idx] = sum * target_factor; +} + + +__kernel void UP ( __global float* X, + __global float* Y, + uint target_width, + uint target_height, + uint source_width, + uint source_height, + uint region_width, + uint region_height, + float target_factor) +{ + uint target_x = get_global_id(0); + uint target_y = get_global_id(1); + uint target_skid = get_global_id(2); + + uint source_x = target_x / region_width; + uint source_y = target_y / region_height; + + uint X_sk = source_width * source_height * target_skid; + const uint X_line = X_sk + (source_width * source_y); + const uint X_idx = X_line + source_x; + const float X_val = X[X_idx]; + + uint Y_sk = target_width * target_height * target_skid; + uint Y_line = Y_sk + (target_width * target_y); + uint Y_idx = Y_line + target_x; + Y[Y_idx] = X_val * target_factor; +} \ No newline at end of file diff --git a/scripts/runexperiments.sh b/scripts/runexperiments.sh index 5acbd9b..0adf25d 100755 --- a/scripts/runexperiments.sh +++ b/scripts/runexperiments.sh @@ -9,7 +9,7 @@ mkdir tmp 2&> /dev/null mkdir logs 2&> /dev/null mkdir csv 2&> /dev/null -echo "Running $ITERATIONS iterations ($EPOCHS epochs each) of network $NETFILE on dataset $DATASET..." +echo "Running $ITERATIONS iterations ($EPOCHS epochs each, testing every 10th epoch) of network $NETFILE on dataset $DATASET..." 
SIGNATURE=$(basename "$DATASET")_$(basename "$NETFILE")_${EPOCHS}_${ITERATIONS}_$TIMESTAMP SCRFILE=tmp/scr_$SIGNATURE @@ -17,7 +17,8 @@ LOGFILE=tmp/log_$SIGNATURE OLOGFILE=logs/log_$SIGNATURE CSVFILE=csv/csv_$SIGNATURE -echo "reset" > $SCRFILE +echo "set experiment name=$SIGNATURE" > $SCRFILE +echo "reset" >> $SCRFILE echo "set epoch=0" >> $SCRFILE for i in $(seq 1 $ITERATIONS) @@ -25,6 +26,9 @@ do for j in $(seq 1 $EPOCHS) do MODELFILE=tmp/model_${SIGNATURE}_i${i}_j$j + echo "tstat enable=0" >> $SCRFILE + echo "train epochs=9" >> $SCRFILE + echo "tstat enable=1" >> $SCRFILE echo "train" >> $SCRFILE echo "save file=$MODELFILE" >> $SCRFILE echo "test" >> $SCRFILE @@ -34,10 +38,10 @@ do done -./trainNetwork $DATASET $NETFILE $SCRFILE 2&> $LOGFILE +./trainNetwork -v $DATASET $NETFILE $SCRFILE &> $LOGFILE mv $LOGFILE $OLOGFILE echo "Training done, output log file: $OLOGFILE" -./logtocsv_multiclass.sh $OLOGFILE > $CSVFILE -echo "Output CSV file: $CSVFILE" +#./logtocsv_multiclass.sh $OLOGFILE > $CSVFILE +#echo "Output CSV file: $CSVFILE" diff --git a/scripts/runexperimentswithmodel.sh b/scripts/runexperimentswithmodel.sh new file mode 100755 index 0000000..de907ba --- /dev/null +++ b/scripts/runexperimentswithmodel.sh @@ -0,0 +1,47 @@ +#!/bin/bash +DATASET=$1 +NETFILE=$2 +EPOCHS=$3 +ITERATIONS=$4 +MODEL=$5 +TIMESTAMP=`date +%s` + +mkdir tmp &> /dev/null +mkdir logs &> /dev/null +mkdir csv &> /dev/null + +echo "Running $ITERATIONS iterations ($EPOCHS * 10 epochs each, testing every 10th epoch) of network $NETFILE on dataset $DATASET, loading model $MODEL..." + +SIGNATURE=$(basename "$DATASET")_$(basename "$NETFILE")_model${MODEL}_${EPOCHS}_${ITERATIONS}_$TIMESTAMP +SCRFILE=tmp/scr_$SIGNATURE +LOGFILE=tmp/log_$SIGNATURE +OLOGFILE=logs/log_$SIGNATURE +CSVFILE=csv/csv_$SIGNATURE + +echo "set experiment name=$SIGNATURE" > $SCRFILE +echo "reset" >> $SCRFILE +echo "load file=$MODEL" >> $SCRFILE +echo "set epoch=0" >> $SCRFILE + +for i in $(seq 1 $ITERATIONS) do + for j in $(seq 1 $EPOCHS) do + MODELFILE=tmp/model_${SIGNATURE}_i${i}_j$j + echo "tstat enable=0" >> $SCRFILE + echo "train epochs=9" >> $SCRFILE + echo "tstat enable=1" >> $SCRFILE + echo "train" >> $SCRFILE + echo "save file=$MODELFILE" >> $SCRFILE + echo "test" >> $SCRFILE + done + echo "reset" >> $SCRFILE + echo "load file=$MODEL" >> $SCRFILE + echo "set epoch=0" >> $SCRFILE done + + +./trainNetwork -v $DATASET $NETFILE $SCRFILE &> $LOGFILE + +mv $LOGFILE $OLOGFILE +echo "Training done, output log file: $OLOGFILE" diff --git a/src/factory/ConfigurableFactory.cpp b/src/factory/ConfigurableFactory.cpp index f62efcf..e8ab9b1 100644 --- a/src/factory/ConfigurableFactory.cpp +++ b/src/factory/ConfigurableFactory.cpp @@ -13,10 +13,12 @@ #include "ResizeLayer.h" #include "MaxPoolingLayer.h" #include "AdvancedMaxPoolingLayer.h" +#include "InputDownSamplingLayer.h" #include "NonLinearityLayer.h" #include "UpscaleLayer.h" #include "SpatialPriorLayer.h" #include "ConcatenationLayer.h" +#include "SumLayer.h" #include "ConfigParsing.h" #include "NetGraph.h" @@ -103,6 +105,14 @@ ConfigurableFactory::ConfigurableFactory (std::istream& file, const unsigned int factory *= ky; } + if (StartsWithIdentifier (line, "downsampling")) { + unsigned int kx, ky; + ParseKernelSizeIfPossible (line, "size", kx, ky); + LOGDEBUG << "Adding down-sampling layer to receptive field (" << kx << "," << ky << ")"; + factorx *= kx; + factory *= ky; + } + if (StartsWithIdentifier (line, "amaxpooling")) { unsigned int kx, ky, sx, sy; ParseKernelSizeIfPossible (line,
"size", kx, ky); @@ -116,6 +126,9 @@ ConfigurableFactory::ConfigurableFactory (std::istream& file, const unsigned int } } } + + LOGDEBUG << "To achieve this receptive field size manually, start net config with manual rfx=" << receptive_field_x_ + << " rfy=" << receptive_field_y_ << " factorx=" << factorx << " factory=" << factory; if (method_ == PATCH) { receptive_field_x_ += factorx; @@ -479,11 +492,11 @@ bool ConfigurableFactory::AddLayers(NetGraph& net, NetGraphConnection data_layer } if (line.compare(0, 4, "popa") == 0) { - last_connection = stack_a[stack_a_pos]; + last_connection = stack_a[stack_a_pos--]; } if (line.compare(0, 4, "popb") == 0) { - last_connection = stack_b[stack_b_pos]; + last_connection = stack_b[stack_b_pos--]; } /* @@ -555,6 +568,19 @@ bool ConfigurableFactory::AddLayers(NetGraph& net, NetGraphConnection data_layer last_connection.backprop = true; } + if (StartsWithIdentifier (line, "downsampling")) { + unsigned int kx = 1, ky = 1; + ParseKernelSizeIfPossible (line, "size", kx, ky); + + InputDownSamplingLayer* mp = new InputDownSamplingLayer (kx, ky); + + NetGraphNode* node = new NetGraphNode(mp, last_connection); + net.AddNode(node); + last_connection.buffer = 0; + last_connection.node = node; + last_connection.backprop = false; + } + if (StartsWithIdentifier (line, "amaxpooling")) { unsigned int kx = 1, ky = 1, sx, sy; ParseKernelSizeIfPossible (line, "size", kx, ky); @@ -644,6 +670,46 @@ bool ConfigurableFactory::AddLayers(NetGraph& net, NetGraphConnection data_layer last_connection.node = node; last_connection.backprop = true; } + + if (StartsWithIdentifier(line, "sum")){ + std::string stack_name; + NetGraphConnection* stack_ptr; + int stack_pos; + ParseStringParamIfPossible(line, "stack", stack_name); + if (stack_name.compare(0, 1, "b") == 0){ + stack_ptr = stack_b; + stack_pos = stack_b_pos; + } + else { + stack_ptr = stack_a; + stack_pos = stack_a_pos; + } + SumLayer* l = new SumLayer(); + NetGraphNode* node = new NetGraphNode(l); + for (int p = stack_pos; p >= 0; p--) { + node->input_connections.push_back(stack_ptr[p]); + } + net.AddNode(node); + last_connection.buffer = 0; + last_connection.node = node; + last_connection.backprop = true; + } + + if (StartsWithIdentifier(line, "upscale")){ + unsigned int ufx = 1, ufy = 1; + unsigned int o = 0; + ParseKernelSizeIfPossible(line, "factor", ufx, ufy); + ParseCountIfPossible(line, "is_output", o); + UpscaleLayer* l = new UpscaleLayer(ufx, ufy); + NetGraphNode* node = new NetGraphNode(l, last_connection); + node->is_output = (o == 1); + net.AddNode(node); + last_connection.buffer = 0; + last_connection.node = node; + last_connection.backprop = true; + is_output = (o == 1); + already_upscaled = true; + } if (is_output && !already_upscaled && method_ == FCN && (factorx != 1 || factory != 1)) { UpscaleLayer* l = new UpscaleLayer(factorx, factory); diff --git a/src/factory/SkipLayerNetworkFactory.cpp b/src/factory/SkipLayerNetworkFactory.cpp new file mode 100644 index 0000000..2e49b44 --- /dev/null +++ b/src/factory/SkipLayerNetworkFactory.cpp @@ -0,0 +1,69 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. 
+ */ +#include + +#include "ErrorLayer.h" + +#include "ConvolutionLayer.h" +#include "LocalResponseNormalizationLayer.h" +#include "ResizeLayer.h" +#include "MaxPoolingLayer.h" +#include "AdvancedMaxPoolingLayer.h" +#include "InputDownSamplingLayer.h" +#include "NonLinearityLayer.h" +#include "UpscaleLayer.h" +#include "SpatialPriorLayer.h" +#include "ConcatenationLayer.h" +#include "ConfigParsing.h" +#include "NetGraph.h" + +#include "SkipLayerNetworkFactory.h" + +namespace Conv { + +bool SkipLayerNetworkFactory::AddLayers(NetGraph& graph, NetGraphConnection data_layer_connection, const unsigned int output_classes, bool add_loss_layer) +{ + return false; +} + +int SkipLayerNetworkFactory::AddLayers(Net& net, Connection data_layer_connection, const unsigned int output_classes, bool add_loss_layer, std::ostream& graph_output) +{ + return 0; +} + +Layer* SkipLayerNetworkFactory::CreateLossLayer(const unsigned int output_classes, const datum loss_weight) +{ + return nullptr; +} + +void SkipLayerNetworkFactory::InitOptimalSettings() +{ + +} + +Method SkipLayerNetworkFactory::method() const +{ + return Method::FCN; +} + +TrainerSettings SkipLayerNetworkFactory::optimal_settings() const +{ + TrainerSettings s; + return s; +} + +int SkipLayerNetworkFactory::patchsizex() +{ + return 0; +} + +int SkipLayerNetworkFactory::patchsizey() +{ + return 0; +} + +} diff --git a/src/math/TensorMath.cpp b/src/math/TensorMath.cpp index 7d7cb25..02f0999 100644 --- a/src/math/TensorMath.cpp +++ b/src/math/TensorMath.cpp @@ -451,4 +451,180 @@ void TensorMath::SMS(const Tensor& source, Tensor& target) target.hint_ignore_content_ = false; } +void TensorMath::DOWN(const Tensor& source, Tensor& target, const int region_width, const int region_height, const datum target_factor) +{ +#ifdef BUILD_OPENCL + if(source.cl_gpu_ || target.cl_gpu_) { + ((Tensor&)source).MoveToGPU(); + target.MoveToGPU(true); + const int target_width = target.width(); + const int target_height = target.height(); + const int source_width = source.width(); + const int source_height = source.height(); + const int maps = target.maps(); + const int samples = target.samples(); + cl_uint error = 0; + + error |= clSetKernelArg (CLHelper::k_down, 0, sizeof (cl_mem), &(((Tensor&)source).cl_data_ptr_)); + error |= clSetKernelArg (CLHelper::k_down, 1, sizeof (cl_mem), &(target.cl_data_ptr_)); + error |= clSetKernelArg (CLHelper::k_down, 2, sizeof (cl_uint), &target_width); + error |= clSetKernelArg (CLHelper::k_down, 3, sizeof (cl_uint), &target_height); + error |= clSetKernelArg (CLHelper::k_down, 4, sizeof (cl_uint), &source_width); + error |= clSetKernelArg (CLHelper::k_down, 5, sizeof (cl_uint), &source_height); + error |= clSetKernelArg (CLHelper::k_down, 6, sizeof (cl_uint), &region_width); + error |= clSetKernelArg (CLHelper::k_down, 7, sizeof (cl_uint), &region_height); + error |= clSetKernelArg (CLHelper::k_down, 8, sizeof (cl_float), &target_factor); + + if (error != CL_SUCCESS) { + FATAL("Error setting kernel args: " << (signed int) error); + } + + size_t global_work_size[] = {(size_t)target.width(), (size_t)target.height(), (size_t)(target.maps() * target.samples())}; + + error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_down, 3, NULL, + global_work_size, NULL, 0, NULL, NULL); + if (error != CL_SUCCESS) { + FATAL("Error enqueueing kernel: " << (signed int) error); + } + +#ifdef BRUTAL_FINISH + error = clFinish (CLHelper::queue); + if (error != CL_SUCCESS) { + FATAL("Error finishing command queue: " << (signed int) error); + } +#endif + }
else { +#endif + const int target_width = target.width(); + const int target_height = target.height(); + const int maps = target.maps(); + const int samples = target.samples(); + for(int sample = 0; sample < samples; sample++) { + for(int map = 0; map < maps; map++) { + for(unsigned int target_y = 0; target_y < target_height; target_y++) { + const unsigned int source_y = region_height * target_y; + for(unsigned int target_x = 0; target_x < target_width; target_x++) { + const unsigned int source_x = region_width * target_x; + datum sum = 0; + for(unsigned int ry = 0; ry < region_height; ry++) { + for(unsigned int rx = 0; rx < region_width; rx++) { + const datum* src = source.data_ptr_const(source_x + rx, source_y + ry, map, sample); + sum += *src; + } + } + datum* tgt = target.data_ptr(target_x, target_y, map, sample); + *tgt = sum * target_factor; + } + } + } + } + +#ifdef BUILD_OPENCL + } +#endif + + target.hint_ignore_content_ = false; +} + +void TensorMath::UP(const Tensor& source, Tensor& target, const int region_width, const int region_height, const datum target_factor) +{ +#ifdef BUILD_OPENCL + if(source.cl_gpu_ || target.cl_gpu_) { + ((Tensor&)source).MoveToGPU(); + target.MoveToGPU(true); + const int target_width = target.width(); + const int target_height = target.height(); + const int source_width = source.width(); + const int source_height = source.height(); + const int maps = target.maps(); + const int samples = target.samples(); + cl_uint error = 0; + + error |= clSetKernelArg (CLHelper::k_up, 0, sizeof (cl_mem), &(((Tensor&)source).cl_data_ptr_)); + error |= clSetKernelArg (CLHelper::k_up, 1, sizeof (cl_mem), &(target.cl_data_ptr_)); + error |= clSetKernelArg (CLHelper::k_up, 2, sizeof (cl_uint), &target_width); + error |= clSetKernelArg (CLHelper::k_up, 3, sizeof (cl_uint), &target_height); + error |= clSetKernelArg (CLHelper::k_up, 4, sizeof (cl_uint), &source_width); + error |= clSetKernelArg (CLHelper::k_up, 5, sizeof (cl_uint), &source_height); + error |= clSetKernelArg (CLHelper::k_up, 6, sizeof (cl_uint), &region_width); + error |= clSetKernelArg (CLHelper::k_up, 7, sizeof (cl_uint), &region_height); + error |= clSetKernelArg (CLHelper::k_up, 8, sizeof (cl_float), &target_factor); + + if (error != CL_SUCCESS) { + FATAL("Error setting kernel args: " << (signed int) error); + } + + size_t global_work_size[] = {(size_t)target.width(), (size_t)target.height(), (size_t)(target.maps() * target.samples())}; + + error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_up, 3, NULL, + global_work_size, NULL, 0, NULL, NULL); + if (error != CL_SUCCESS) { + FATAL("Error enqueueing kernel: " << (signed int) error); + } + +#ifdef BRUTAL_FINISH + error = clFinish (CLHelper::queue); + if (error != CL_SUCCESS) { + FATAL("Error finishing command queue: " << (signed int) error); + } +#endif + } else { +#endif + const datum region_area = (datum)region_width * (datum)region_height; + const int width = source.width(); + const int height = source.height(); + const int maps = source.maps(); + const int samples = source.samples(); + for(int sample = 0; sample < samples; sample++) { + for(int map = 0; map < maps; map++) { + for(unsigned int y = 0; y < height; y++) { + const unsigned int iy = region_height * y; + for(unsigned int x = 0; x < width; x++) { + const unsigned int ix = region_width * x; + const datum* src = source.data_ptr_const(x, y, map, sample); + datum sum = *src; + for(unsigned int ry = 0; ry < region_height; ry++) { + for(unsigned int rx = 0; rx < region_width; rx++) { + datum* tgt
= target.data_ptr(ix + rx, iy + ry, map, sample); + *tgt = sum * target_factor; + } + } + } + } + } + } +#ifdef BUILD_OPENCL + } +#endif + + target.hint_ignore_content_ = false; +} + +void TensorMath::ADD(const Tensor& source_a, const Tensor& source_b, Tensor& target) +{ +#ifdef BUILD_OPENCL + ((Tensor&)source_a).MoveToCPU(); + ((Tensor&)source_b).MoveToCPU(); + target.MoveToCPU(true); +#endif + if((source_a.samples() != source_b.samples()) + || (source_b.samples() != target.samples()) + || (source_a.elements() != source_b.elements()) + || (source_b.elements() != target.elements())) { + FATAL("Dimensions don't match!"); + } + + #pragma omp parallel for default(shared) + for(unsigned int element = 0; element < source_a.elements(); element++) { + const datum* source_a_ptr = &(source_a.data_ptr_const()[element]); + const datum* source_b_ptr = &(source_b.data_ptr_const()[element]); + datum* target_ptr = &(target.data_ptr()[element]); + + *target_ptr = *source_a_ptr + *source_b_ptr; + } + + target.hint_ignore_content_ = false; +} + + } diff --git a/src/net/BinaryStatLayer.cpp b/src/net/BinaryStatLayer.cpp index 895aa8a..d286f5f 100644 --- a/src/net/BinaryStatLayer.cpp +++ b/src/net/BinaryStatLayer.cpp @@ -6,6 +6,7 @@ */ #include "Log.h" #include "Init.h" +#include "StatAggregator.h" #include "BinaryStatLayer.h" @@ -37,6 +38,132 @@ BinaryStatLayer::BinaryStatLayer ( const unsigned int thresholds, } Reset(); + + // Initialize stat descriptors + stat_fpr_ = new StatDescriptor; + stat_fnr_ = new StatDescriptor; + stat_pre_ = new StatDescriptor; + stat_rec_ = new StatDescriptor; + stat_acc_ = new StatDescriptor; + stat_f1_ = new StatDescriptor; + + stat_fpr_->description = "False Positive Rate"; + stat_fpr_->unit = "%"; + stat_fpr_->nullable = true; + stat_fpr_->init_function = [this] (Stat& stat) { stat.is_null = true; stat.value = 0; Reset(); }; + stat_fpr_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_fpr_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + stat_fnr_->description = "False Negative Rate"; + stat_fnr_->unit = "%"; + stat_fnr_->nullable = true; + stat_fnr_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_fnr_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_fnr_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + stat_pre_->description = "Precision"; + stat_pre_->unit = "%"; + stat_pre_->nullable = true; + stat_pre_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_pre_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_pre_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + stat_rec_->description = "Recall"; + stat_rec_->unit = "%"; + stat_rec_->nullable = true; + stat_rec_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_rec_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_rec_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + stat_acc_->description = "Accuracy"; + stat_acc_->unit = "%"; + stat_acc_->nullable = true; + stat_acc_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_acc_->update_function = [] (Stat& stat, double user_value) { 
stat.is_null = false; stat.value = user_value; }; + stat_acc_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + stat_f1_->description = "F1 Value"; + stat_f1_->unit = "%"; + stat_f1_->nullable = true; + stat_f1_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_f1_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_f1_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { return stat; }; + + // Register stats + System::stat_aggregator->RegisterStat(stat_fpr_); + System::stat_aggregator->RegisterStat(stat_fnr_); + System::stat_aggregator->RegisterStat(stat_pre_); + System::stat_aggregator->RegisterStat(stat_rec_); + System::stat_aggregator->RegisterStat(stat_acc_); + System::stat_aggregator->RegisterStat(stat_f1_); +} + +void BinaryStatLayer::UpdateAll() { + // Calculate metrics + datum fmax = -2; + unsigned int tfmax = -1; + + for ( unsigned int t = 0; t < thresholds_; t++ ) { + datum precision = -1; + datum recall = -1; + datum f1 = -1; + + if ( ( true_positives_[t] + false_positives_[t] ) > 0 ) + precision = ( true_positives_[t] ) / + ( true_positives_[t] + false_positives_[t] ); + + if ( ( true_positives_[t] + false_negatives_[t] ) > 0 ) + recall = ( true_positives_[t] ) / + ( true_positives_[t] + false_negatives_[t] ); + + if ( precision >= 0 && recall >= 0 ) { + f1 = 2 * precision * recall / ( precision + recall ); + } + + if ( f1 > fmax ) { + fmax = f1; + tfmax = t; + } + } + + datum fpr = -1; + datum fnr = -1; + datum precision = -1; + datum recall = -1; + datum f1 = -1; + datum acc = -1; + + if ( ( true_positives_[tfmax] + false_positives_[tfmax] ) > 0 ) + precision = ( true_positives_[tfmax] ) / + ( true_positives_[tfmax] + false_positives_[tfmax] ); + + if ( ( true_positives_[tfmax] + false_negatives_[tfmax] ) > 0 ) + recall = ( true_positives_[tfmax] ) / + ( true_positives_[tfmax] + false_negatives_[tfmax] ); + + if ( ( false_positives_[tfmax] + true_negatives_[tfmax] ) > 0 ) + fpr = ( false_positives_[tfmax] ) / + ( false_positives_[tfmax] + true_negatives_[tfmax] ); + + if ( ( true_positives_[tfmax] + false_negatives_[tfmax] ) > 0 ) + fnr = ( false_negatives_[tfmax] ) / + ( true_positives_[tfmax] + false_negatives_[tfmax] ); + + if ( precision >= 0 && recall >= 0 ) + f1 = 2 * precision * recall / ( precision + recall ); + + acc = ( true_positives_[tfmax] + true_negatives_[tfmax] ) / + ( true_positives_[tfmax] + true_negatives_[tfmax] + + false_negatives_[tfmax] + false_positives_[tfmax] + ); + + // Update stats + if(fpr >= 0) System::stat_aggregator->Update(stat_fpr_->stat_id, 100.0 * fpr); + if(fnr >= 0) System::stat_aggregator->Update(stat_fnr_->stat_id, 100.0 * fnr); + if(precision >= 0) System::stat_aggregator->Update(stat_pre_->stat_id, 100.0 * precision); + if(recall >= 0) System::stat_aggregator->Update(stat_rec_->stat_id, 100.0 * recall); + if(acc >= 0) System::stat_aggregator->Update(stat_acc_->stat_id, 100.0 * acc); + if(f1 >= 0) System::stat_aggregator->Update(stat_f1_->stat_id, 100.0 * f1); } bool BinaryStatLayer::CreateOutputs ( const std::vector< CombinedTensor* >& inputs, std::vector< CombinedTensor* >& outputs ) { @@ -141,85 +268,7 @@ void BinaryStatLayer::Reset() { } void BinaryStatLayer::Print ( std::string prefix, bool training ) { - datum fmax = -2; - unsigned int tfmax = -1; - - for ( unsigned int t = 0; t < thresholds_; t++ ) { - datum precision = -1; - datum recall = -1; - datum f1 = -1; - - if ( ( 
true_positives_[t] + false_positives_[t] ) > 0 ) - precision = ( true_positives_[t] ) / - ( true_positives_[t] + false_positives_[t] ); - - if ( ( true_positives_[t] + false_negatives_[t] ) > 0 ) - recall = ( true_positives_[t] ) / - ( true_positives_[t] + false_negatives_[t] ); - - /*acc = ( true_positives_[t] + true_negatives_[t] ) / - ( true_positives_[t] + true_negatives_[t] + - false_negatives_[t] + false_positives_[t] - ); - - LOGDEBUG << "Accuracy (" << threshold_values_[t] << "): " << acc;*/ - - if ( precision >= 0 && recall >= 0 ) { - f1 = 2 * precision * recall / ( precision + recall ); - } - - if ( f1 > fmax ) { - fmax = f1; - tfmax = t; - } - } - - datum fpr = -1; - datum fnr = -1; - datum precision = -1; - datum recall = -1; - datum f1 = -1; - datum acc = -1; - - if ( ( true_positives_[tfmax] + false_positives_[tfmax] ) > 0 ) - precision = ( true_positives_[tfmax] ) / - ( true_positives_[tfmax] + false_positives_[tfmax] ); - - if ( ( true_positives_[tfmax] + false_negatives_[tfmax] ) > 0 ) - recall = ( true_positives_[tfmax] ) / - ( true_positives_[tfmax] + false_negatives_[tfmax] ); - - if ( ( false_positives_[tfmax] + true_negatives_[tfmax] ) > 0 ) - fpr = ( false_positives_[tfmax] ) / - ( false_positives_[tfmax] + true_negatives_[tfmax] ); - - if ( ( true_positives_[tfmax] + false_negatives_[tfmax] ) > 0 ) - fnr = ( false_negatives_[tfmax] ) / - ( true_positives_[tfmax] + false_negatives_[tfmax] ); - - if ( precision >= 0 && recall >= 0 ) - f1 = 2 * precision * recall / ( precision + recall ); - - acc = ( true_positives_[tfmax] + true_negatives_[tfmax] ) / - ( true_positives_[tfmax] + true_negatives_[tfmax] + - false_negatives_[tfmax] + false_positives_[tfmax] - ); - - ( training ? LOGTRESULT : LOGRESULT ) - << prefix << " F1 : " << f1 * 100.0 << "% (t=" << threshold_values_[tfmax] - << ")" << LOGRESULTEND; - ( training ? LOGTRESULT : LOGRESULT ) - << prefix << " ACC: " << acc * 100.0 << "%" << LOGRESULTEND; - ( training ? LOGTRESULT : LOGRESULT ) - << prefix << " PRE: " << precision * 100.0 << "%" << LOGRESULTEND; - ( training ? LOGTRESULT : LOGRESULT ) - << prefix << " REC: " << recall * 100.0 << "%" << LOGRESULTEND; - ( training ? LOGTRESULT : LOGRESULT ) - << prefix << " FPR: " << fpr * 100.0 << "%" << LOGRESULTEND; - ( training ? 
LOGTRESULT : LOGRESULT ) - << prefix << " FNR: " << fnr * 100.0 << "%" << LOGRESULTEND; - - + // Now deprecated } diff --git a/src/net/ConfusionMatrixLayer.cpp b/src/net/ConfusionMatrixLayer.cpp index 27c2f7a..1f189fa 100644 --- a/src/net/ConfusionMatrixLayer.cpp +++ b/src/net/ConfusionMatrixLayer.cpp @@ -8,12 +8,15 @@ #include <iomanip> #include <sstream> #include <cmath> +#include "../util/StatAggregator.h" + #include "ConfusionMatrixLayer.h" namespace Conv { ConfusionMatrixLayer::ConfusionMatrixLayer ( std::vector<std::string> names, const unsigned int classes ) : - classes_ ( classes ), names_ ( names ) { + classes_ ( classes ), names_ ( names ) +{ LOGDEBUG << "Instance created, " << classes << " classes."; for(unsigned int n = 0; n < names_.size(); n++) { if(names_[n].length() > 11) { @@ -21,6 +24,91 @@ ConfusionMatrixLayer::ConfusionMatrixLayer ( names_[n] = original.substr(0,8) + "..."; } } + // Initialize stat descriptors + stat_orr_ = new StatDescriptor; + stat_arr_ = new StatDescriptor; + stat_iou_ = new StatDescriptor; + + stat_orr_->description = "Overall Recognition Rate"; + stat_orr_->unit = "%"; + stat_orr_->nullable = true; + stat_orr_->init_function = [this] (Stat& stat) { stat.is_null = true; stat.value = 0; Reset();}; + stat_orr_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_orr_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { + return stat; + }; + + stat_arr_->description = "Average Recognition Rate"; + stat_arr_->unit = "%"; + stat_arr_->nullable = true; + stat_arr_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_arr_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_arr_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { + return stat; + }; + + stat_iou_->description = "Average Intersection over Union"; + stat_iou_->unit = "%"; + stat_iou_->nullable = true; + stat_iou_->init_function = [] (Stat& stat) { stat.is_null = true; stat.value = 0; }; + stat_iou_->update_function = [] (Stat& stat, double user_value) { stat.is_null = false; stat.value = user_value; }; + stat_iou_->output_function = [] (HardcodedStats& hc_stats, Stat& stat) -> Stat { + return stat; + }; + + // Register with StatAggregator + System::stat_aggregator->RegisterStat(stat_orr_); + System::stat_aggregator->RegisterStat(stat_arr_); + System::stat_aggregator->RegisterStat(stat_iou_); +} + +void ConfusionMatrixLayer::UpdateAll() { + // Don't call Update(...) when there are no samples to keep the null property of the value + if (total_ < 1.0) + return; + + long double orr = 0, arr = 0, iou = 0; + + // Calculate metrics... + + // Overall recognition rate + orr = 100.0L * right_ / total_; + + // Average recognition rate + long double ccount = 0; + long double sum = 0; + + for ( unsigned int c = 0; c < classes_; c++ ) { + if ( per_class_[c] > 0 ) { + sum += matrix_[ ( c * classes_ ) + c] / per_class_[c]; + ccount += 1.0L; + } + } + + arr = 100.0L * sum / ccount; + + // Intersection over union + long double IU_sum = 0; + for(unsigned int t = 0; t < classes_; t++) { + // Calculate IU measure for class T + long double unionn = 0; + for(unsigned int c = 0; c < classes_; c++) { + if(c!=t) { + unionn += matrix_[ ( t * classes_ ) + c]; + unionn += matrix_[ ( c * classes_ ) + t]; + } + } + unionn += matrix_[ ( t * classes_) + t]; + long double IU = (unionn > 0.0) ? (matrix_[ ( t * classes_) + t] / unionn) : 0.0; + IU_sum += IU; + } + + iou = 100.0L * IU_sum / (long double)classes_; + + // Submit metrics to StatAggregator + System::stat_aggregator->Update(stat_orr_->stat_id, (double)orr); + System::stat_aggregator->Update(stat_arr_->stat_id, (double)arr); + System::stat_aggregator->Update(stat_iou_->stat_id, (double)iou); }
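For intuition, the per-class IU above reduces to true positives over (true positives + false positives + false negatives). A standalone sketch on an invented 2x2 confusion matrix; the counts are hypothetical, only the loop mirrors UpdateAll:

#include <cstdio>

int main() {
  // Rows: actual class, columns: predicted class (invented counts)
  long double matrix[2][2] = {{50, 10}, {5, 35}};
  long double IU_sum = 0;
  for (int t = 0; t < 2; t++) {
    long double unionn = matrix[t][t];  // intersection, counted once
    for (int c = 0; c < 2; c++)
      if (c != t) unionn += matrix[t][c] + matrix[c][t];  // both error directions
    IU_sum += (unionn > 0.0) ? matrix[t][t] / unionn : 0.0;
  }
  // (50/65 + 35/50) / 2 = 73.46%
  printf("Average intersection over union: %.2Lf%%\n", 100.0L * IU_sum / 2.0L);
  return 0;
}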
bool ConfusionMatrixLayer::CreateOutputs ( @@ -128,6 +216,7 @@ void ConfusionMatrixLayer::Reset() { } void ConfusionMatrixLayer::Print ( std::string prefix, bool training ) { + // Print confusion matrix std::stringstream caption; caption << std::setw ( 12 ) << "vCLS ACT>"; @@ -151,24 +240,9 @@ void ConfusionMatrixLayer::Print ( std::string prefix, bool training ) { caption.str ( "" ); } - - (training?LOGTRESULT:LOGRESULT) << prefix << " Overall recognition rate (not normalized): " - << 100.0L * right_ / total_ << "%"; - - long double ccount = 0; - long double sum = 0; - - for ( unsigned int c = 0; c < classes_; c++ ) { - if ( per_class_[c] > 0 ) { - sum += matrix_[ ( c * classes_ ) + c] / per_class_[c]; - ccount += 1.0L; - } - } - - (training?LOGTRESULT:LOGRESULT) << prefix << " Average recognition rate (normalized) : " - << 100.0 * sum / ccount << "%" << LOGRESULTEND; - + // Print IOU long double IU_sum = 0; + for(unsigned int t = 0; t < classes_; t++) { // Calculate IU measure for class T long double unionn = 0; @@ -183,14 +257,14 @@ void ConfusionMatrixLayer::Print ( std::string prefix, bool training ) { IU_sum += IU; caption << std::setw(12) << names_[t]; caption << " IU: "; - caption << std::setw(12) << IU * 100.0; + caption << std::setw(12) << IU * 100.0L; caption << "%"; (training?LOGTRESULT:LOGRESULT) << prefix << caption.str() << LOGRESULTEND; caption.str ( "" ); } (training?LOGTRESULT:LOGRESULT) << prefix << " Average intersection over union : " - << 100.0 * IU_sum / (long double)classes_ << "%" << LOGRESULTEND; + << 100.0L * IU_sum / (long double)classes_ << "%" << LOGRESULTEND; } ConfusionMatrixLayer::~ConfusionMatrixLayer() { diff --git a/src/net/InputDownSamplingLayer.cpp b/src/net/InputDownSamplingLayer.cpp new file mode 100644 index 0000000..4b08b61 --- /dev/null +++ b/src/net/InputDownSamplingLayer.cpp @@ -0,0 +1,98 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project.
+ */ + +#include "Log.h" +#include "TensorMath.h" + +#include "InputDownSamplingLayer.h" + +namespace Conv { + +InputDownSamplingLayer::InputDownSamplingLayer (const unsigned int region_width, + const unsigned int region_height) : + region_width_ (region_width), region_height_ (region_height) { + LOGDEBUG << "Instance created: " << region_width_ << "x" << region_height_ << + " pooling."; +} + +bool InputDownSamplingLayer::CreateOutputs ( + const std::vector< CombinedTensor* >& inputs, + std::vector< CombinedTensor* >& outputs) { + // This is a simple layer, only one input + if (inputs.size() != 1) { + LOGERROR << "Only one input supported!"; + return false; + } + + // Save input node pointer + CombinedTensor* input = inputs[0]; + + // Check if input node pointer is null + if (input == nullptr) { + LOGERROR << "Null pointer input node!"; + return false; + } + + // Validate dimensions + if ( (input->data.width() % region_width_) != 0 || + (input->data.height() % region_height_) != 0) { + LOGERROR << "Input dimensions not divisible by region dimensions!"; + return false; + } + + // Create output + CombinedTensor* output = new CombinedTensor (input->data.samples(), + input->data.width() / region_width_, input->data.height() / region_height_, + input->data.maps()); + + // Tell network about the output + outputs.push_back (output); + + return true; +} + +bool InputDownSamplingLayer::Connect (const CombinedTensor* input, + CombinedTensor* output) { + // TODO Validate dimensions + bool valid = true; + + if (!valid) { + LOGERROR << "Invalid dimensions!"; + return false; + } + + // Save dimensions + input_width_ = input->data.width(); + input_height_ = input->data.height(); + output_width_ = output->data.width(); + output_height_ = output->data.height(); + + maps_ = input->data.maps(); + + return true; +} + +void InputDownSamplingLayer::FeedForward() { + TensorMath::DOWN(input_->data, output_->data, region_width_, region_height_, 1.0f / ((datum)region_width_ * (datum)region_height_)); +} + +void InputDownSamplingLayer::BackPropagate() { + if(backprop_enabled_) { + FATAL("This is a pre-processing layer that does not support backpropagation!"); + } +} + + +bool InputDownSamplingLayer::IsOpenCLAware() { +#ifdef BUILD_OPENCL_MAX + return true; +#else + return false; +#endif +} + +} diff --git a/src/net/NetGraph.cpp b/src/net/NetGraph.cpp index c7ff22c..4360d72 100644 --- a/src/net/NetGraph.cpp +++ b/src/net/NetGraph.cpp @@ -166,12 +166,12 @@ void NetGraph::PrintGraph(std::ostream& graph_output) { for (unsigned int i = 0; i < node->output_buffers.size(); i++) { if (i > 0) node_output << "|"; - node_output << "" << node->output_buffers[i].description; + node_output << "" << node->output_buffers[i].description << " " << node->output_buffers[i].combined_tensor->data; } node_output << "}"; } else if (node->output_buffers.size() == 1) { - node_output << "| " << node->output_buffers[0].description; + node_output << "| " << node->output_buffers[0].description << " " << node->output_buffers[0].combined_tensor->data; } node_output << "}\"];\n"; diff --git a/src/net/SumLayer.cpp b/src/net/SumLayer.cpp new file mode 100644 index 0000000..3d0b7fb --- /dev/null +++ b/src/net/SumLayer.cpp @@ -0,0 +1,115 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. 
+ */ + +#include + +#include "TensorMath.h" +#include "SumLayer.h" + +namespace Conv { + + +SumLayer::SumLayer() { + LOGDEBUG << "Instance created."; +} + +bool SumLayer::CreateOutputs (const std::vector< CombinedTensor* >& inputs, + std::vector< CombinedTensor* >& outputs) { + if(inputs.size() != 2) { + LOGERROR << "Needs two inputs!"; + return false; + } + + CombinedTensor* input_a = inputs[0]; + CombinedTensor* input_b = inputs[1]; + + if(input_a == nullptr || input_b == nullptr) { + LOGERROR << "Null pointer supplied"; + return false; + } + + if(input_a->data.width() != input_b->data.width() + || input_a->data.height() != input_b->data.height()) { + LOGERROR << "Dimensions don't match!"; + return false; + } + + if(input_a->data.samples() != input_b->data.samples()) { + LOGERROR << "Sample count doesn't match!"; + return false; + } + + unsigned int maps_a = input_a->data.maps(); + unsigned int maps_b = input_b->data.maps(); + + if(maps_a != maps_b) { + LOGERROR << "Map count doesn't match"; + return false; + } + + unsigned int samples = input_a->data.samples(); + CombinedTensor* output = new CombinedTensor(samples, input_a->data.width(), + input_b->data.height(), maps_a); + + outputs.push_back(output); + return true; +} + +bool SumLayer::Connect (const std::vector< CombinedTensor* >& inputs, + const std::vector< CombinedTensor* >& outputs, + const NetStatus* status ) { + if(inputs.size() != 2) { + LOGERROR << "Needs two inputs!"; + return false; + } + + if(outputs.size() != 1) { + LOGERROR << "Needs exactly one output!"; + return false; + } + + CombinedTensor* input_a = inputs[0]; + CombinedTensor* input_b = inputs[1]; + CombinedTensor* output = outputs[0]; + + if(input_a == nullptr || input_b == nullptr || output == nullptr) { + LOGERROR << "Null pointer supplied"; + return false; + } + + if(input_a->data.samples() != input_b->data.samples()) { + LOGERROR << "Sample count doesn't match!"; + return false; + } + + if((output->data.elements() != input_a->data.elements()) + || (output->data.elements() != input_b->data.elements())) { + LOGERROR << "Wrong output dimensions!"; + return false; + } + + maps_ = input_a->data.maps(); + samples_ = input_a->data.samples(); + + input_a_ = input_a; + input_b_ = input_b; + output_ = output; + + return true; +} + +void SumLayer::FeedForward() { + TensorMath::ADD(input_a_->data, input_b_->data, output_->data); +} + +void SumLayer::BackPropagate() { + for(unsigned int sample = 0; sample < samples_; sample++) { + Tensor::CopySample(output_->delta, sample, input_a_->delta, sample); + Tensor::CopySample(output_->delta, sample, input_b_->delta, sample); + } +} + +} \ No newline at end of file diff --git a/src/net/Trainer.cpp b/src/net/Trainer.cpp index 0cfec4c..f5ee335 100644 --- a/src/net/Trainer.cpp +++ b/src/net/Trainer.cpp @@ -12,66 +12,240 @@ #include "Net.h" #include "StatLayer.h" #include "CLHelper.h" +#include "StatAggregator.h" +#include "Init.h" #include "Trainer.h" + namespace Conv { - Trainer::Trainer(Conv::NetGraph& graph, TrainerSettings settings) : - graph_(graph), settings_(settings) { - LOGDEBUG << "Instance created"; +bool Trainer::stats_are_initialized_ = false; +StatDescriptor* Trainer::stat_aggloss_ = nullptr; +StatDescriptor* Trainer::stat_qp_caseA_ = nullptr; +StatDescriptor* Trainer::stat_qp_caseB_ = nullptr; +StatDescriptor* Trainer::stat_qp_caseC_ = nullptr; +StatDescriptor* Trainer::stat_qp_caseM_ = nullptr; +StatDescriptor* Trainer::stat_sps_ = nullptr; +StatDescriptor* Trainer::stat_fps_ = nullptr; + +void Trainer::InitializeStats() { + // Only
initialize stats once + if (!stats_are_initialized_) { + + stat_aggloss_ = new StatDescriptor; + stat_aggloss_->nullable = true; + stat_aggloss_->description = "Average Aggregate Loss"; + stat_aggloss_->unit = "1/pixel"; + stat_aggloss_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_aggloss_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_aggloss_->output_function = + [](HardcodedStats& hc_stats, Stat& stat) -> Stat { + Stat return_stat; return_stat.is_null = true; + if (hc_stats.iterations > 0) { + double d_iterations = (double)hc_stats.iterations; + return_stat.value = stat.value / d_iterations; + return_stat.is_null = false; + } + return return_stat; + }; + + stat_qp_caseA_ = new StatDescriptor; + stat_qp_caseA_->nullable = true; + stat_qp_caseA_->description = "QuickProp Case A Percentage"; + stat_qp_caseA_->unit = "%"; + stat_qp_caseA_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_qp_caseA_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_qp_caseA_->output_function = + [](HardcodedStats& hc_stats, Stat& stat) -> Stat { + Stat return_stat; return_stat.is_null = true; + if (hc_stats.iterations > 0 && hc_stats.weights > 0 && !stat.is_null) { + double d_iterations = (double)hc_stats.iterations; + double d_weights = (double)hc_stats.weights; + return_stat.value = 100.0 * stat.value / (d_iterations * d_weights); + return_stat.is_null = false; + } + return return_stat; + }; + + stat_qp_caseB_ = new StatDescriptor; + stat_qp_caseB_->nullable = true; + stat_qp_caseB_->description = "QuickProp Case B Percentage"; + stat_qp_caseB_->unit = "%"; + stat_qp_caseB_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_qp_caseB_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_qp_caseB_->output_function = + [](HardcodedStats& hc_stats, Stat& stat) -> Stat { + Stat return_stat; return_stat.is_null = true; + if (hc_stats.iterations > 0 && hc_stats.weights > 0 && !stat.is_null) { + double d_iterations = (double)hc_stats.iterations; + double d_weights = (double)hc_stats.weights; + return_stat.value = 100.0 * stat.value / (d_iterations * d_weights); + return_stat.is_null = false; + } + return return_stat; + }; + + stat_qp_caseC_ = new StatDescriptor; + stat_qp_caseC_->nullable = true; + stat_qp_caseC_->description = "QuickProp Case C Percentage"; + stat_qp_caseC_->unit = "%"; + stat_qp_caseC_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_qp_caseC_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_qp_caseC_->output_function = + [](HardcodedStats& hc_stats, Stat& stat) -> Stat { + Stat return_stat; return_stat.is_null = true; + if (hc_stats.iterations > 0 && hc_stats.weights > 0 && !stat.is_null) { + double d_iterations = (double)hc_stats.iterations; + double d_weights = (double)hc_stats.weights; + return_stat.value = 100.0 * stat.value / (d_iterations * d_weights); + return_stat.is_null = false; + } + return return_stat; + }; + + stat_qp_caseM_ = new StatDescriptor; + stat_qp_caseM_->nullable = true; + stat_qp_caseM_->description = "QuickProp Case M Percentage"; + stat_qp_caseM_->unit = "%"; + stat_qp_caseM_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_qp_caseM_->update_function = + 
[](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_qp_caseM_->output_function = + [](HardcodedStats& hc_stats, Stat& stat) -> Stat { + Stat return_stat; return_stat.is_null = true; + if (hc_stats.iterations > 0 && hc_stats.weights > 0 && !stat.is_null) { + double d_iterations = (double)hc_stats.iterations; + double d_weights = (double)hc_stats.weights; + return_stat.value = 100.0 * stat.value / (d_iterations * d_weights); + return_stat.is_null = false; + } + return return_stat; + }; + + stat_sps_ = new StatDescriptor; + stat_sps_->nullable = true; + stat_sps_->description = "Pixel Throughput"; + stat_sps_->unit = "pixels/s"; + stat_sps_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_sps_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_sps_->output_function = + [] (Conv::HardcodedStats& hc_stats, Conv::Stat& stat) { + Conv::Stat return_stat = stat; + return_stat.value = stat.value / hc_stats.seconds_elapsed; + return return_stat; + }; + + stat_fps_ = new StatDescriptor; + stat_fps_->nullable = true; + stat_fps_->description = "Frame Rate"; + stat_fps_->unit = "frames/s"; + stat_fps_->init_function = + [](Stat& stat) {stat.is_null = true; stat.value = 0.0;}; + stat_fps_->update_function = + [](Stat& stat, double user_value) {stat.value += user_value; stat.is_null = false;}; + stat_fps_->output_function = + [] (Conv::HardcodedStats& hc_stats, Conv::Stat& stat) { + Conv::Stat return_stat = stat; + return_stat.value = stat.value / hc_stats.seconds_elapsed; + return return_stat; + }; + + // Register stats + System::stat_aggregator->RegisterStat(stat_aggloss_); + System::stat_aggregator->RegisterStat(stat_qp_caseA_); + System::stat_aggregator->RegisterStat(stat_qp_caseB_); + System::stat_aggregator->RegisterStat(stat_qp_caseC_); + System::stat_aggregator->RegisterStat(stat_qp_caseM_); + System::stat_aggregator->RegisterStat(stat_sps_); + System::stat_aggregator->RegisterStat(stat_fps_); + stats_are_initialized_ = true; + } + + // Move lambdas with reference captures here +} - // We need a training layer to select training samples and some kind of - // loss function to minimize - if (graph_.GetTrainingNodes().size() == 0 || graph_.GetLossNodes().size() == 0) { - FATAL("Net doesn't have training layer or loss function layer!"); - } +Trainer::Trainer(Conv::NetGraph& graph, TrainerSettings settings) : + graph_(graph), settings_(settings) { + LOGDEBUG << "Instance created"; - // Ask the Net for parameters - graph_.GetParameters(parameters_); + // We need a training layer to select training samples and some kind of + // loss function to minimize + if (graph_.GetTrainingNodes().size() == 0 || graph_.GetLossNodes().size() == 0) { + FATAL("Net doesn't have training layer or loss function layer!"); + } - LOGDEBUG << "Optimizing " << parameters_.size() << " sets of parameters."; + // Ask the Net for parameters + graph_.GetParameters(parameters_); - unsigned int w = 0; + LOGDEBUG << "Optimizing " << parameters_.size() << " sets of parameters."; - for (unsigned int p = 0; p < parameters_.size(); p++) { - w += parameters_[p]->data.elements(); + unsigned int w = 0; - // Allocate Tensors for momentum - Tensor* last_delta = new Tensor(); - Tensor* last_gradient = new Tensor(); - Tensor* accumulated_gradient = new Tensor(); - last_delta->Resize (parameters_[p]->data); - last_delta->Clear(); - last_gradient->Resize (parameters_[p]->data); - last_gradient->Clear(); - 
accumulated_gradient->Resize (parameters_[p]->data); - accumulated_gradient->Clear(); + for (unsigned int p = 0; p < parameters_.size(); p++) { + w += parameters_[p]->data.elements(); - last_deltas_.push_back (last_delta); - last_gradients_.push_back (last_gradient); - accumulated_gradients_.push_back (accumulated_gradient); - } + // Allocate Tensors for momentum + Tensor* last_delta = new Tensor(); + Tensor* last_gradient = new Tensor(); + Tensor* accumulated_gradient = new Tensor(); + last_delta->Resize (parameters_[p]->data); + last_delta->Clear(); + last_gradient->Resize (parameters_[p]->data); + last_gradient->Clear(); + accumulated_gradient->Resize (parameters_[p]->data); + accumulated_gradient->Clear(); - // Outputs the number of weights - LOGDEBUG << "Weights: " << w; + last_deltas_.push_back (last_delta); + last_gradients_.push_back (last_gradient); + accumulated_gradients_.push_back (accumulated_gradient); + } + + // Outputs the number of weights + LOGDEBUG << "Weights: " << w; + weight_count_ = w; + + first_training_layer_ = dynamic_cast<TrainingLayer*>(graph_.GetTrainingNodes()[0]->layer); + sample_count_ = first_training_layer_->GetLabelWidth() * first_training_layer_->GetLabelHeight() + * first_training_layer_->GetBatchSize(); - first_training_layer_ = dynamic_cast<TrainingLayer*>(graph_.GetTrainingNodes()[0]->layer); - sample_count_ = first_training_layer_->GetLabelWidth() * first_training_layer_->GetLabelHeight() - * first_training_layer_->GetBatchSize(); + InitializeStats(); } -void Trainer::Train (unsigned int epochs) { - // net_.SetTestOnlyStatDisabled (false); - graph_.SetIsTesting(false); +void Trainer::Train (unsigned int epochs, bool do_snapshots) { + // Update hardcoded stats + System::stat_aggregator->hardcoded_stats_.weights = weight_count_; - for (unsigned int e = 0; e < epochs; e++) + graph_.SetIsTesting(false); + graph_.SetStatLayersEnabled(settings_.stats_during_training); + + for (unsigned int e = 0; e < epochs; e++) { Epoch(); + if(do_snapshots) { + System::stat_aggregator->Snapshot(); + // Update hardcoded stats + System::stat_aggregator->hardcoded_stats_.weights = weight_count_; + } + } - // net_.SetTestOnlyStatDisabled (false); + graph_.SetStatLayersEnabled(true); } void Trainer::Test() { + // Update hardcoded stats + System::stat_aggregator->hardcoded_stats_.weights = weight_count_; + datum aggregate_loss = 0.0; datum* loss_sums = new datum[graph_.GetLossNodes().size()]; for (unsigned int n = 0; n < graph_.GetLossNodes().size(); n++) @@ -90,9 +264,8 @@ void Trainer::Test() { LOGDEBUG << "Testing, iterations: " << iterations << ", batch size: " << first_training_layer_->GetBatchSize(); - auto t_begin = std::chrono::system_clock::now(); - for (unsigned int i = 0; i < iterations; i++) { + aggregate_loss = 0.0; graph_.FeedForward(); for (unsigned int n = 0; n < graph_.GetLossNodes().size(); n++) { LossFunctionLayer* lossfunction_layer = dynamic_cast<LossFunctionLayer*>(graph_.GetLossNodes()[n]->layer); @@ -100,41 +273,43 @@ void Trainer::Test() { loss_sums[n] += loss; aggregate_loss += loss; } - } + // Batch/Iteration done + if (System::stat_aggregator->state_ == StatAggregator::RECORDING) + System::stat_aggregator->hardcoded_stats_.iterations++; - auto t_end = std::chrono::system_clock::now(); - std::chrono::duration<double> t_diff = t_end - t_begin; - LOGDEBUG << "Testing, sps: " << - (datum) (sample_count_ * iterations) - / (datum) t_diff.count(); + // Update aggregate loss stat + System::stat_aggregator->Update(stat_aggloss_->stat_id, aggregate_loss + / sample_count_ ); - LOGDEBUG << "Testing, tps: " << 1000000.0f * (datum) t_diff.count() / - (datum) (sample_count_ * iterations) << " us"; + } + + // Submit performance statistics + System::stat_aggregator->Update(stat_sps_->stat_id, (double)sample_count_ * (double)iterations); + System::stat_aggregator->Update(stat_fps_->stat_id, (double)(first_training_layer_->GetBatchSize()) * (double)iterations); for (unsigned int n = 0; n < graph_.GetLossNodes().size(); n++) { LossFunctionLayer* lossfunction_layer = dynamic_cast<LossFunctionLayer*>(graph_.GetLossNodes()[n]->layer); LOGINFO << "Testing (Epoch " << epoch_ << ", node " << n << ") " << graph_.GetLossNodes()[n]->layer->GetLayerDescription() << " lps: " << loss_sums[n] / (datum)(iterations * sample_count_); } - LOGINFO << "Testing (Epoch " << epoch_ << ") aggregate lps: " << aggregate_loss / (datum)(iterations * sample_count_); for (unsigned int n = 0; n < graph_.GetStatNodes().size(); n++) { StatLayer* stat_layer = dynamic_cast<StatLayer*>(graph_.GetStatNodes()[n]->layer); std::stringstream epochname; epochname << "Testing - Epoch " << epoch_ << " -"; + stat_layer->UpdateAll(); stat_layer->Print (epochname.str(), false); - stat_layer->Reset(); } for (NetGraphNode* training_node : graph_.GetTrainingNodes()) (dynamic_cast<TrainingLayer*>(training_node->layer))->SetTestingMode(false); - graph_.SetIsTesting(false); - delete[] loss_sums; }
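The manual sps/tps timing removed above is superseded by the sps/fps stats: during recording, Update() accumulates raw sample and frame counts, and the output_function divides the total by the wall time the aggregator measured between StartRecording() and the snapshot. A toy illustration with invented numbers:

#include <cstdio>

int main() {
  // Accumulated by Update() over a test run (hypothetical values)
  double stat_value = 0.0;
  stat_value += 512.0 * 100.0;    // sample_count_ * iterations
  // Applied by the output_function at snapshot time
  double seconds_elapsed = 12.5;  // hc_stats.seconds_elapsed (hypothetical)
  printf("Pixel Throughput: %.1f pixels/s\n", stat_value / seconds_elapsed);  // 4096.0
  return 0;
}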
void Trainer::Epoch() { + // Update hardcoded epoch stat + System::stat_aggregator->hardcoded_stats_.epoch = epoch_; + datum aggregate_loss = 0.0; datum* loss_sums = new datum[graph_.GetLossNodes().size()]; for (unsigned int n = 0; n < graph_.GetLossNodes().size(); n++) @@ -157,8 +332,6 @@ void Trainer::Epoch() { ", bsize: " << first_training_layer_->GetBatchSize() * settings_.sbatchsize << ", current lr: " << CalculateLR (epoch_ * iterations) << std::endl; - auto t_begin = std::chrono::system_clock::now(); - for (unsigned int i = 0; i < iterations; i++) { if ( (50 * i / iterations) > fiftieth) { fiftieth = 50 * i / iterations; @@ -169,6 +342,7 @@ tenth = 10 * i / iterations; std::cout << tenth << "0%" << std::flush; } + aggregate_loss = 0.0; // Reset gradients for (unsigned int np = 0; np < accumulated_gradients_.size(); np++) @@ -207,48 +381,41 @@ } } } - // Calculate annealed learning rate const datum lr = CalculateLR (epoch_ * iterations + i); // Apply gradients with new learning rate ApplyGradients (lr); - } - auto t_end = std::chrono::system_clock::now(); - std::chrono::duration<double> t_diff = t_end - t_begin; - LOGDEBUG << "Training, sps: " << - (datum) (sample_count_ * settings_.sbatchsize - * first_training_layer_->GetLossSamplingProbability() * iterations) - / (datum) t_diff.count(); - - LOGDEBUG << "Training, tps: " << - 1000000.0f * (datum) t_diff.count() / - (datum) (sample_count_ * settings_.sbatchsize - * first_training_layer_->GetLossSamplingProbability() * iterations) << " us"; - -#ifdef BUILD_OPENCL - LOGDEBUG << "Training, GB/s up: " << ((datum)CLHelper::bytes_up)/(1073741824.0 * (datum)t_diff.count()); - LOGDEBUG << "Training, GB/s down: " << ((datum)CLHelper::bytes_down)/(1073741824.0 * (datum)t_diff.count()); - CLHelper::bytes_up = 0; - CLHelper::bytes_down = 0; -#endif + // Batch/Iteration done + if (System::stat_aggregator->state_ == StatAggregator::RECORDING) + System::stat_aggregator->hardcoded_stats_.iterations++; + // Update aggregate loss stat + System::stat_aggregator->Update(stat_aggloss_->stat_id, aggregate_loss + / (first_training_layer_->GetLossSamplingProbability() * sample_count_ * settings_.sbatchsize)); + } + + // Submit
performance statistics + System::stat_aggregator->Update(stat_sps_->stat_id, (double)sample_count_ * (double)iterations * (double)(settings_.sbatchsize)); + System::stat_aggregator->Update(stat_fps_->stat_id, (double)(first_training_layer_->GetBatchSize()) * (double)iterations * (double)(settings_.sbatchsize)); + // Display training epoch_error for (unsigned int n = 0; n < graph_.GetLossNodes().size(); n++) { LossFunctionLayer* lossfunction_layer = dynamic_cast<LossFunctionLayer*>(graph_.GetLossNodes()[n]->layer); LOGINFO << "Training (Epoch " << epoch_ << ", node " << n << ") " << graph_.GetLossNodes()[n]->layer->GetLayerDescription() << " lps: " << loss_sums[n] / (datum)(iterations * sample_count_ * settings_.sbatchsize * first_training_layer_->GetLossSamplingProbability()); } - LOGINFO << "Training (Epoch " << epoch_ << ") aggregate lps: " << aggregate_loss / (datum)(iterations * sample_count_ * settings_.sbatchsize * first_training_layer_->GetLossSamplingProbability()); - for (unsigned int n = 0; n < graph_.GetStatNodes().size(); n++) { - StatLayer* stat_layer = dynamic_cast<StatLayer*>(graph_.GetStatNodes()[n]->layer); - std::stringstream epochname; - epochname << "Training - Epoch " << epoch_ << " -"; - stat_layer->Print (epochname.str(), true); - stat_layer->Reset(); - } + if(settings_.stats_during_training) { + for (unsigned int n = 0; n < graph_.GetStatNodes().size(); n++) { + StatLayer* stat_layer = dynamic_cast<StatLayer*>(graph_.GetStatNodes()[n]->layer); + std::stringstream epochname; + epochname << "Training - Epoch " << epoch_ << " -"; + stat_layer->UpdateAll(); + stat_layer->Print (epochname.str(), true); + } + } delete[] loss_sums; epoch_++; @@ -256,7 +423,8 @@ void Trainer::ApplyGradients (datum lr) { unsigned int dp = 0; - + unsigned int qp_caseA = 0, qp_caseB = 0, qp_caseC = 0, qp_caseM = 0; + for (unsigned int l = 0; l < graph_.GetNodes().size(); l++) { Layer* const layer = graph_.GetNodes()[l]->layer; datum layer_lr; @@ -290,7 +458,7 @@ void Trainer::ApplyGradients (datum lr) { datum delta = // Average of gradient over minibatch - layer_lr * (w_gradient / (datum) (sample_count_ * settings_.sbatchsize)) + + layer_lr * (w_gradient / ((datum) (sample_count_ * settings_.sbatchsize)) * first_training_layer_->GetLossSamplingProbability()) + // Regularization layer_lr * (settings_.l2_weight * l2_gradient + settings_.l1_weight * l1_gradient); @@ -313,29 +481,36 @@ const datum s = settings_.mu / (1.0 + settings_.mu); datum step = 0; - if(last_step > 0.001) { + if(last_step > 0.00001) { if(delta > 0.0) { step += lr * settings_.eta * delta; + qp_caseB++; } if(delta > (s * last_gradient)) { step += settings_.mu * last_step; + qp_caseM++; } else { step += last_step * delta / (last_gradient - delta); } + qp_caseA++; - } else if(last_step < -0.001) { + } else if(last_step < -0.00001) { if(delta < 0.0) { step += lr * settings_.eta * delta; + qp_caseB++; } if(delta < (s * last_gradient)) { step += settings_.mu * last_step; + qp_caseM++; } else { step += last_step * delta / (last_gradient - delta); } + qp_caseA++; } else { step += lr * settings_.eta * delta; + qp_caseC++; } if(step > 1000 || step < -1000) { @@ -358,6 +533,14 @@ dp++; } } + + // Update quickprop stats + if(settings_.optimization_method == QUICKPROP) { + System::stat_aggregator->Update(stat_qp_caseA_->stat_id, (double)qp_caseA); + System::stat_aggregator->Update(stat_qp_caseB_->stat_id, (double)qp_caseB); + System::stat_aggregator->Update(stat_qp_caseC_->stat_id, (double)qp_caseC); + System::stat_aggregator->Update(stat_qp_caseM_->stat_id, (double)qp_caseM); + } } std::ostream& operator<< (std::ostream & output,
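The counters added above classify, per weight and iteration, which QuickProp branch fired: case B (gradient-descent term added), case M (maximum growth factor mu applied), case A (any nonzero previous step), case C (dead previous step, plain descent). A self-contained sketch of the positive branch; the gradient values are invented and eta/mu here are placeholders, not CN24 defaults, and the mirrored negative branch is omitted:

#include <cstdio>

int main() {
  double lr = 0.01, eta = 0.5, mu = 1.75;  // hypothetical settings
  double last_step = 0.02, last_gradient = -0.4, delta = -0.1;
  const double s = mu / (1.0 + mu);
  double step = 0;
  int qp_caseA = 0, qp_caseB = 0, qp_caseC = 0, qp_caseM = 0;
  if (last_step > 0.00001) {
    if (delta > 0.0) { step += lr * eta * delta; qp_caseB++; }
    if (delta > (s * last_gradient)) { step += mu * last_step; qp_caseM++; }
    else step += last_step * delta / (last_gradient - delta);
    qp_caseA++;
  } else {
    step += lr * eta * delta; qp_caseC++;  // dead previous step: plain descent
  }
  printf("step=%.4f A=%d B=%d C=%d M=%d\n", step, qp_caseA, qp_caseB, qp_caseC, qp_caseM);
  return 0;
}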
diff --git a/src/net/UpscaleLayer.cpp b/src/net/UpscaleLayer.cpp index d0f1cf9..04d59a1 100644 --- a/src/net/UpscaleLayer.cpp +++ b/src/net/UpscaleLayer.cpp @@ -8,6 +8,7 @@ #include "Log.h" #include "Init.h" +#include "TensorMath.h" #include "UpscaleLayer.h" @@ -70,48 +71,11 @@ bool UpscaleLayer::Connect ( const CombinedTensor* input, } void UpscaleLayer::FeedForward() { - #pragma omp parallel for default(shared) - - for ( std::size_t sample = 0; sample < input_->data.samples(); sample++ ) { - for ( unsigned int map = 0; map < maps_; map++ ) { - for ( unsigned int ox = 0; ox < output_width_; ox++ ) { - for ( unsigned int oy = 0; oy < output_height_; oy++ ) { - const unsigned int ix = ox / region_width_; - const unsigned int iy = oy / region_height_; - const datum ival = *input_->data.data_ptr_const ( ix, iy, map, sample ); - // Feed forward - *output_->data.data_ptr ( ox, oy, map, sample ) = ival; - } - } - } - } + TensorMath::UP(input_->data, output_->data, region_width_, region_height_, 1.0f); } void UpscaleLayer::BackPropagate() { - #pragma omp parallel for default(shared) - - for ( std::size_t sample = 0; sample < input_->data.samples(); sample++ ) { - for ( unsigned int map = 0; map < maps_; map++ ) { - for ( unsigned int ix = 0; ix < input_width_; ix++ ) { - for ( unsigned int iy = 0; iy < input_height_; iy++ ) { - const unsigned int ox = ix * region_width_; - const unsigned int oy = iy * region_height_; - datum sum = 0; - - for ( unsigned int ry = 0; ry < region_height_; ry++ ) { - for ( unsigned int rx = 0; rx < region_width_; rx++ ) { - sum += *output_->delta.data_ptr_const ( ox + rx, oy +ry, map, sample ); - } - } - - *input_->delta.data_ptr ( ix,iy,map,sample ) = sum; // (datum)(region_width_ * region_height_); - } - } - } - } - - return; - + TensorMath::DOWN(output_->delta, input_->delta, region_width_, region_height_, 1.0f); } } diff --git a/src/util/CSVStatSink.cpp b/src/util/CSVStatSink.cpp new file mode 100644 index 0000000..4f7ac90 --- /dev/null +++ b/src/util/CSVStatSink.cpp @@ -0,0 +1,94 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#include <string> +#include <vector> + +#include <fstream> +#include <sstream> + +#include <iomanip> +#include <limits> +#include <algorithm> +#include <cctype> + +#include "CSVStatSink.h" + +namespace Conv { + +void CSVStatSink::Initialize(std::vector<StatDescriptor*>& stat_descriptors) { + stat_descriptors_ = stat_descriptors; +} + +void CSVStatSink::Process(Conv::HardcodedStats &hardcoded_stats, std::vector<Stat*> &stats) { + if(csv_stream_ == nullptr) + return; + + // Write hardcoded stats + (*csv_stream_) << (hardcoded_stats.is_training ? "1" : "0") << ","; + (*csv_stream_) << hardcoded_stats.epoch << ","; + (*csv_stream_) << hardcoded_stats.iterations << ","; + (*csv_stream_) << std::setprecision(std::numeric_limits<double>::digits10 + 1) << hardcoded_stats.seconds_elapsed << ","; + + // Write values...
for (unsigned int s = 0; s < stat_descriptors_.size(); s++) { + // ...but only if not NULL + if(!stats[s]->is_null) + (*csv_stream_) << std::setprecision(std::numeric_limits<double>::digits10 + 1) << stats[s]->value; + + // Add comma except for last column + if(s < (stat_descriptors_.size() - 1)) + (*csv_stream_) << ","; + } + (*csv_stream_) << "\n"; + (*csv_stream_) << std::flush; + +} + +bool isnalnum(char c) { + return !std::isalnum((int)c); +} + +void CSVStatSink::SetCurrentExperiment(std::string current_experiment) { + // Close stream if already open + if(csv_stream_ != nullptr) { + csv_stream_->close(); + delete csv_stream_; + } + + // Generate filename + std::stringstream csv_filename_ss; + csv_filename_ss << "csv/" << current_experiment << ".csv"; + std::string csv_filename=csv_filename_ss.str(); + + // Open new stream + csv_stream_ = new std::ofstream(csv_filename, std::ios::out); + + // Test if stream works + if(!csv_stream_->good()) { + LOGERROR << "Cannot open " << csv_filename << " for writing!"; + delete csv_stream_; + csv_stream_ = nullptr; + return; + } + // Write header for hardcoded stats + (*csv_stream_) << "IsTraining,Epoch,Iterations,SecondsElapsed,"; + + // Write header for non-hardcoded stats + for (unsigned int s = 0; s < stat_descriptors_.size(); s++) { + std::string description = stat_descriptors_[s]->description; + + // Strip non-alphanumeric characters + description.erase(std::remove_if(description.begin(), description.end(), isnalnum), description.end()); + (*csv_stream_) << description; + if(s < (stat_descriptors_.size() - 1)) + (*csv_stream_) << ","; + } + (*csv_stream_) << "\n"; + (*csv_stream_) << std::flush; +} + +} \ No newline at end of file
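For illustration, if only the three ConfusionMatrixLayer stats were registered, csv/<experiment>.csv would look roughly like this (values invented; the headers are the stat descriptions with non-alphanumeric characters stripped, so "Average Intersection over Union" becomes "AverageIntersectionoverUnion", and a null stat leaves its field empty):

IsTraining,Epoch,Iterations,SecondsElapsed,OverallRecognitionRate,AverageRecognitionRate,AverageIntersectionoverUnion
0,3,120,84.5,91.2,87.6,73.4
1,4,0,84.5,,,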
diff --git a/src/util/CompressedTensor.cpp b/src/util/CompressedTensor.cpp new file mode 100644 index 0000000..b8a76a5 --- /dev/null +++ b/src/util/CompressedTensor.cpp @@ -0,0 +1,349 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <cmath> +#include <cstdlib> +#include <cstdio> +#include <fstream> +#include <iostream> + + +#ifdef BUILD_POSIX +#include <sys/mman.h> +#include <unistd.h> +#include <cerrno> +#endif + +#include "PNGUtil.h" +#include "JPGUtil.h" + +#ifdef BLAS_MKL +#include <mkl.h> +#endif + +#include "Config.h" +#include "Log.h" +#include "CompressedTensor.h" +#include "CLHelper.h" + +namespace Conv { + +const unsigned int chars_per_datum = sizeof(Conv::datum)/sizeof(char); + +CompressedTensor::CompressedTensor() { + +} + +CompressedTensor::~CompressedTensor() { + DeleteIfPossible(); +} + +void CompressedTensor::Compress(Tensor& tensor) +{ + std::size_t compressed_length = 0; + std::size_t uncompressed_elements = tensor.elements(); +#ifdef BUILD_OPENCL + tensor.MoveToCPU(); +#endif + + void* compressed_buffer = new char[2 * tensor.elements() * chars_per_datum + 2]; + CompressedTensor::CompressData((void*)tensor.data_ptr(), uncompressed_elements, compressed_buffer, compressed_length); + + Resize(tensor.samples(), tensor.width(), tensor.height(), tensor.maps(), compressed_length, (char*)compressed_buffer, false); +} + +void CompressedTensor::Decompress(Tensor& tensor, datum* preallocated_buffer) +{ + std::size_t compressed_length = compressed_length_; + std::size_t uncompressed_elements = 0; + datum* uncompressed_buffer = preallocated_buffer; + if(uncompressed_buffer == nullptr) + uncompressed_buffer = new datum[elements_]; + + CompressedTensor::DecompressData(uncompressed_buffer, uncompressed_elements, compressed_data_ptr_, compressed_length); + + if(uncompressed_elements != elements_) { + FATAL("Decompressed size mismatch!"); + } + + tensor.Resize(samples_, width_, height_, maps_, uncompressed_buffer, false); +} + + +void CompressedTensor::Resize ( const std::size_t samples, const std::size_t width, + const std::size_t height, const std::size_t maps, const std::size_t compressed_length, char* const preallocated_memory, bool mmapped) { + // Delete the old allocation + DeleteIfPossible(); + + // Don't need to allocate zero memory + if ( compressed_length == 0 ) + return; + + if(preallocated_memory != nullptr) { + compressed_data_ptr_ = preallocated_memory; + mmapped_ = mmapped; + } else { + // Allocate + compressed_data_ptr_ = new char[compressed_length]; + } + + // Save configuration + samples_ = samples; + width_ = width; + height_ = height; + maps_ = maps; + elements_ = samples * width * height * maps; + compressed_length_ = compressed_length; +} + +void CompressedTensor::Serialize ( std::ostream& output ) { + uint64_t samples = samples_; + uint64_t width = width_; + uint64_t height = height_; + uint64_t maps = maps_; + uint64_t compressed_length = compressed_length_; + + output.write ( ( const char* ) &samples, sizeof ( uint64_t ) / sizeof ( char ) ); + output.write ( ( const char* ) &width, sizeof ( uint64_t ) / sizeof ( char ) ); + output.write ( ( const char* ) &height, sizeof ( uint64_t ) / sizeof ( char ) ); + output.write ( ( const char* ) &maps, sizeof ( uint64_t ) / sizeof ( char ) ); + output.write ( ( const char* ) &compressed_length, sizeof ( uint64_t ) / sizeof ( char ) ); + + if ( elements_ > 0 ) + output.write ( ( const char* ) compressed_data_ptr_, compressed_length_); +} + +void CompressedTensor::Deserialize ( std::istream& input , bool head_only, bool try_mmap, int fd) { + uint64_t samples = 0; + uint64_t width = 0; + uint64_t height = 0; + uint64_t maps = 0; + uint64_t compressed_length = 0; + + if ( !input.good() ) + LOGERROR << "Cannot deserialize from this stream!"; + + input.read ( ( char* ) &samples, sizeof ( uint64_t ) /
sizeof ( char ) ); + input.read ( ( char* ) &width, sizeof ( uint64_t ) / sizeof ( char ) ); + input.read ( ( char* ) &height, sizeof ( uint64_t ) / sizeof ( char ) ); + input.read ( ( char* ) &maps, sizeof ( uint64_t ) / sizeof ( char ) ); + input.read ( ( char* ) &compressed_length, sizeof ( uint64_t ) / sizeof ( char ) ); + +#ifdef BUILD_POSIX + if(!try_mmap || fd == 0) +#endif + Resize ( samples, width, height, maps, compressed_length ); + + if ( compressed_length > 0 && !head_only ) { +#ifdef BUILD_POSIX + if(try_mmap && fd != 0) { + // Get page size + long int page_size = sysconf(_SC_PAGESIZE); + long int current_position = input.tellg(); + long int offset_in_page = current_position % page_size; +#ifdef BUILD_LINUX + void* target_mmap = mmap64(NULL, compressed_length + offset_in_page, PROT_READ, MAP_PRIVATE, fd, current_position - offset_in_page); +#elif defined(BUILD_OSX) + // OS X is 64-bit by default + void* target_mmap = mmap(NULL, compressed_length + offset_in_page, PROT_READ, MAP_PRIVATE, fd, current_position - offset_in_page); +#endif + if(target_mmap == MAP_FAILED) { + LOGERROR << "Memory map failed: " << errno; + } + original_mmap_ = target_mmap; + + target_mmap = (void*)(((long)target_mmap) + offset_in_page); + Resize(samples, width, height, maps, compressed_length, (char*)target_mmap, true); + input.seekg(compressed_length, std::ios::cur); + } else +#endif + input.read ( ( char* ) compressed_data_ptr_, compressed_length); + } + else if(head_only) + input.seekg(compressed_length, std::ios::cur); +} + +void CompressedTensor::DeleteIfPossible() { + if ( compressed_data_ptr_ != nullptr ) { +#ifdef BUILD_POSIX + if(mmapped_) { + munmap((void*)original_mmap_, compressed_length_); + original_mmap_ = nullptr; + mmapped_ = false; + } else { +#endif + delete[] compressed_data_ptr_; +#ifdef BUILD_POSIX + } +#endif + + compressed_data_ptr_ = nullptr; + } + + samples_ = 0; + width_ = 0; + height_ = 0; + maps_ = 0; + elements_ = samples_ * width_ * height_ * maps_; + compressed_length_ = 0; +} + +void CompressedTensor::PrintStats() { + +} + +std::ostream& operator<< ( std::ostream& output, const CompressedTensor& tensor ) { + return output << "C(" << tensor.samples() << "s@" << tensor.width() << + "x" << tensor.height() << "x" << tensor.maps() << "m)"; +} + +/* + * This is the compression part. Don't change this or you will break the file format. 
+ */ +const unsigned char rl_marker = 'X'; +const unsigned char rl_doublemarker = 'X'; +const unsigned char rl_rle = 'Y'; +const unsigned int rl_bytes = 1; +const unsigned int rl_max = (unsigned int)((1L << (8L * (unsigned long)rl_bytes)) - 3L); +const unsigned int rl_min = 1 + (5 + rl_bytes) / chars_per_datum; + +void CompressedTensor::CompressData(void* uncompressed, const std::size_t& uncompressed_elements, void* compressed, std::size_t& compressed_length) +{ + std::size_t bytes_out = 0; + + Conv::datum last_symbol = 0; + std::size_t running_length = 0; + + unsigned char* output_ptr = (unsigned char*)compressed; + + const datum* data_ptr_const = (const datum*) uncompressed; + + for(std::size_t pos = 0; pos <= uncompressed_elements; pos++) { + Conv::datum current_symbol = 0; + if(pos < uncompressed_elements) { + current_symbol = data_ptr_const[pos]; + if(current_symbol == last_symbol) { + // Increase running length + running_length++; + } + } else { + // Force emission of last symbol + } + + + if( + // EOF reached + (pos == uncompressed_elements) || + // Different symbol + (current_symbol != last_symbol) || + // Maximum run length reached + (running_length == rl_max)) { + + // Emit... + if(running_length > 0 && running_length < rl_min) { + // Emit single symbol(s) + for(std::size_t r = 0; r < running_length; r++) { + for(std::size_t b = 0; b < chars_per_datum; b++) { + char char_to_emit = ((char*)&last_symbol)[b]; + if(char_to_emit == rl_marker) { + // Emit escaped + *output_ptr = rl_marker; + output_ptr++; bytes_out++; + *output_ptr = rl_doublemarker; + output_ptr++; bytes_out++; + } else { + // Emit directly + *output_ptr = char_to_emit; + output_ptr++; bytes_out++; + } + } + } + } else if(running_length >= rl_min) { + // Emit encoded + *output_ptr = rl_marker; + output_ptr++; bytes_out++; + *output_ptr = rl_rle; + output_ptr++; bytes_out++; + + // Running length output + for(std::size_t b = 0; b < rl_bytes; b++) { + *output_ptr = (running_length >> ((rl_bytes - (b+1)) * 8)) & 0xFF; + output_ptr++; bytes_out++; + } + + for(std::size_t b = 0; b < chars_per_datum; b++) { + unsigned char char_to_emit = ((char*)&last_symbol)[b]; + *output_ptr = char_to_emit; + output_ptr++; bytes_out++; + } + } + + // ...and reset + if(running_length == rl_max) + running_length = 0; + else + running_length = 1; + } + + last_symbol = current_symbol; + } + compressed_length = bytes_out; +} + +void CompressedTensor::DecompressData(void* uncompressed, std::size_t& uncompressed_elements, void* compressed, const std::size_t& compressed_length) +{ + unsigned int bytes_out = 0; + unsigned char* output_ptr = (unsigned char*)uncompressed; + const unsigned char* input_ptr = (const unsigned char*)compressed; + + for(unsigned int pos = 0; pos < compressed_length; pos++) { + unsigned char current_symbol = input_ptr[pos]; + if(current_symbol == rl_marker) { + pos++; current_symbol = input_ptr[pos]; + if(current_symbol == rl_doublemarker) { + // Emit single marker + *output_ptr = rl_marker; + output_ptr++; bytes_out++; + } else if(current_symbol == rl_rle) { + unsigned int running_length = 0; + + // Running length input + for(unsigned int b = 0; b < rl_bytes; b++) { + pos++; current_symbol = input_ptr[pos]; + running_length += current_symbol; + if((b+1) != rl_bytes) + running_length <<= 8; + } + + for(unsigned int r = 0; r < running_length; r++) { + for(unsigned int b = 0; b < chars_per_datum; b++) { + pos++; current_symbol = input_ptr[pos]; + *output_ptr = current_symbol; + output_ptr++; bytes_out++; + } + pos -= chars_per_datum; + } + pos += chars_per_datum; + } else { + FATAL("Incorrect encoding!"); + } + } else { + // Emit directly + *output_ptr = current_symbol; + output_ptr++; bytes_out++; + } + } + if(bytes_out % chars_per_datum != 0) { + FATAL("Compressed length wrong!"); + } + uncompressed_elements = bytes_out / chars_per_datum; +} + + +}
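A byte-level illustration of the on-disk format defined above, assuming a 4-byte datum (so chars_per_datum == 4, rl_min == 2, rl_max == 253): a run of 200 copies of 0.0f, 800 raw bytes, becomes 7 bytes, while a literal 'X' byte is escaped by doubling. The sketch below only assembles the encoded run by hand:

#include <cstdio>

int main() {
  float value = 0.0f;
  unsigned int running_length = 200;                  // <= rl_max (253)
  unsigned char out[7];
  out[0] = 'X';                                       // rl_marker
  out[1] = 'Y';                                       // rl_rle
  out[2] = (unsigned char)(running_length & 0xFF);    // rl_bytes == 1 length byte
  for (int b = 0; b < 4; b++)
    out[3 + b] = ((unsigned char*)&value)[b];         // the repeated datum
  for (int i = 0; i < 7; i++) printf("%02X ", out[i]);
  printf("\n");                                       // 58 59 C8 00 00 00 00
  return 0;
}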
diff --git a/src/util/CompressedTensorStream.cpp b/src/util/CompressedTensorStream.cpp new file mode 100644 index 0000000..8f1d7f2 --- /dev/null +++ b/src/util/CompressedTensorStream.cpp @@ -0,0 +1,97 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project. + */ + +#include <fstream> +#include <iostream> + +#ifdef BUILD_POSIX +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#endif + +#include "CompressedTensorStream.h" + +namespace Conv { + +unsigned int CompressedTensorStream::LoadFile(std::string path) +{ + std::ifstream input_stream(path, std::ios::binary | std::ios::in); + if(!input_stream.good()) { + FATAL("Cannot open file: " << path); + } +#ifdef BUILD_POSIX + int input_fd = open(path.c_str(), O_RDONLY); + if(input_fd < 0) { + FATAL("Cannot open file: " << path); + } +#endif + + uint64_t magic = 0; + input_stream.read((char*)&magic, sizeof(uint64_t)/sizeof(char)); + + if(magic != CN24_CTS_MAGIC) { + FATAL("Wrong magic at start of stream!"); + } + + // Go through file + std::cout << std::endl << std::flush; + + while (!input_stream.eof()) { + CompressedTensor* tensor = new CompressedTensor(); +#ifdef BUILD_POSIX + tensor->Deserialize (input_stream, false, true, input_fd); +#else + tensor->Deserialize (input_stream, false); +#endif + + if (tensor->elements() == 0) + break; + + if(tensor->elements() > max_elements_) + max_elements_ = tensor->elements(); + + tensors_.push_back(tensor); + std::cout << "." << std::flush; + input_stream.peek(); + } + + temp_tensor_.Resize(1,max_elements_); + return 0; +} + +bool CompressedTensorStream::CopySample(const unsigned int source, const std::size_t source_sample, + Conv::Tensor& target, const std::size_t target_sample) +{ + if(source < tensors_.size()) { + CompressedTensor* const ctensor = tensors_[source]; + if(source_sample == 0 && ctensor->width() == target.width() && ctensor->height() == target.height() && ctensor->maps() == target.maps() && ctensor->samples() == 1) { + // This is a little hack for faster loading of certain datasets +#ifdef BUILD_OPENCL + target.MoveToCPU(); +#endif + datum* old_data_ptr = temp_tensor_.data_ptr(); + datum* direct_ptr = target.data_ptr(0, 0, 0, target_sample); + temp_tensor_.Resize(1, max_elements_, 1, 1, direct_ptr, false, true); + ctensor->Decompress(temp_tensor_, temp_tensor_.data_ptr()); + + temp_tensor_.Resize(1, max_elements_, 1, 1, old_data_ptr, false, true); + return true; + } else { + ctensor->Decompress(temp_tensor_, temp_tensor_.data_ptr()); + return Tensor::CopySample(temp_tensor_, source_sample, target, target_sample); + } + } else + return false; +} + +} diff --git a/src/util/FloatTensorStream.cpp b/src/util/FloatTensorStream.cpp new file mode 100644 index 0000000..443fdcf --- /dev/null +++ b/src/util/FloatTensorStream.cpp @@ -0,0 +1,65 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project.
+ */ + +#include <fstream> +#include <iostream> + +#ifdef BUILD_POSIX +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#endif + +#include "FloatTensorStream.h" + +namespace Conv { + +unsigned int FloatTensorStream::LoadFile(std::string path) +{ + std::ifstream input_stream(path, std::ios::binary | std::ios::in); + if(!input_stream.good()) { + FATAL("Cannot open file: " << path); + } +#ifdef BUILD_POSIX + int input_fd = open(path.c_str(), O_RDONLY); + if(input_fd < 0) { + FATAL("Cannot open file: " << path); + } +#endif + + // Go through file + std::cout << std::endl << std::flush; + + while (!input_stream.eof()) { + Tensor* tensor = new Tensor(); +#ifdef BUILD_POSIX + tensor->Deserialize (input_stream, false, true, input_fd); +#else + tensor->Deserialize (input_stream, false); +#endif + + if (tensor->elements() == 0) + break; + + tensors_.push_back(tensor); + std::cout << "." << std::flush; + input_stream.peek(); + } + return 0; +} + +bool FloatTensorStream::CopySample(const unsigned int source, const std::size_t source_sample, + Conv::Tensor& target, const std::size_t target_sample) +{ + if(source < tensors_.size()) { + return Tensor::CopySample(*tensors_[source], source_sample, target, target_sample); + } else + return false; +} + +} diff --git a/src/util/GradientTester.cpp b/src/util/GradientTester.cpp index 8cbf5df..df576d2 100644 --- a/src/util/GradientTester.cpp +++ b/src/util/GradientTester.cpp @@ -11,7 +11,7 @@ namespace Conv { -void GradientTester::TestGradient ( NetGraph& graph ) { +void GradientTester::TestGradient ( NetGraph& graph, unsigned int skip_weights, bool fatal_fail ) { const double epsilon = 0.005; LOGDEBUG << "Testing gradient. FeedForward..."; graph.FeedForward(); @@ -37,8 +37,10 @@ void GradientTester::TestGradient ( NetGraph& graph ) { unsigned int okay = 0; unsigned int tolerable = 0; unsigned int failed = 0; - for(unsigned int e = 0; e < param->data.elements(); e++) + unsigned int total = 0; + for(unsigned int e = 0; e < param->data.elements(); e+=(skip_weights + 1)) { + total++; #ifdef BUILD_OPENCL param->data.MoveToCPU(); param->delta.MoveToCPU(); @@ -83,23 +85,27 @@ graph.FeedForward(); } // std::cout << "\n"; if(passed) { - LOGINFO << "Okay!"; + LOGDEBUG << "Okay!"; } else { LOGERROR << "Failed!"; } - LOGINFO << okay << " of " << param->data.elements() << " gradients okay (delta < 2%)"; - LOGINFO << tolerable << " of " << param->data.elements() << " gradients tolerable (delta < 20%)"; - LOGINFO << failed << " of " << param->data.elements() << " gradients failed (delta >= 20%)"; + LOGDEBUG << okay << " of " << total << " gradients okay (delta < 2%)"; + LOGDEBUG << tolerable << " of " << total << " gradients tolerable (delta < 20%)"; + LOGDEBUG << failed << " of " << total << " gradients failed (delta >= 20%)"; global_okay += okay; global_tolerable += tolerable; global_failed += failed; - global_weights += param->data.elements(); + global_weights += total; } } - LOGINFO << global_okay << " of " << global_weights << " gradients okay (delta < 2%)"; - LOGINFO << global_tolerable << " of " << global_weights << " gradients tolerable (delta < 20%)"; - LOGINFO << global_failed << " of " << global_weights << " gradients failed (delta >= 20%)"; + LOGRESULT << global_okay << " of " << global_weights << " tested gradients okay (delta < 2%)" << LOGRESULTEND; + LOGRESULT << global_tolerable << " of " << global_weights << " tested gradients tolerable (delta < 20%)" << LOGRESULTEND; + LOGRESULT << global_failed << " of " << global_weights << " tested gradients failed (delta >= 20%)" << LOGRESULTEND; + + if (global_failed > 0 && fatal_fail) { + FATAL("Failed gradient check!"); + } }
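TestGradient compares each backpropagated derivative against a central difference using the same epsilon as above; on a toy one-weight "loss" (a stand-in function, not a CN24 layer) the check looks like this:

#include <cstdio>
#include <cmath>

static double loss(double w) { return 0.5 * w * w; }  // toy loss, dL/dw = w

int main() {
  const double epsilon = 0.005;
  double w = 1.3;
  double analytic = w;  // what backpropagation would report
  double numeric = (loss(w + epsilon) - loss(w - epsilon)) / (2.0 * epsilon);
  double delta = std::fabs(numeric - analytic) / std::fabs(analytic);
  printf("relative delta: %.4f%% (okay < 2%%, tolerable < 20%%)\n", 100.0 * delta);
  return 0;
}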
diff --git a/src/util/Init.cpp b/src/util/Init.cpp index bffc3ce..c5a460b 100644 --- a/src/util/Init.cpp +++ b/src/util/Init.cpp @@ -34,6 +34,7 @@ #endif #include "TensorViewer.h" +#include "StatAggregator.h" namespace Conv { @@ -66,9 +67,12 @@ cl_kernel CLHelper::k_setValue = 0; cl_kernel CLHelper::k_sms = 0; cl_kernel CLHelper::k_im2col = 0; cl_kernel CLHelper::k_col2im = 0; +cl_kernel CLHelper::k_up = 0; +cl_kernel CLHelper::k_down = 0; #endif TensorViewer* System::viewer = nullptr; +StatAggregator* System::stat_aggregator = nullptr; int System::log_level = 0; #define STRING_SHA1 GIT_SHA1 @@ -83,7 +87,7 @@ void System::Init(int requested_log_level) { } else log_level = requested_log_level; - LOGINFO << "CN24 version " STRING_SHA1; + LOGINFO << "CN24 v2.0.0 at " STRING_SHA1; LOGINFO << "Copyright (C) 2015 Clemens-Alexander Brust"; LOGINFO << "For licensing information, see the LICENSE" << " file included with this project."; @@ -125,7 +129,12 @@ void System::Init(int requested_log_level) { LOGWARN << "Could not initialize GTK!"; } #endif + + // Initialize global TensorViewer viewer = new TensorViewer(); + + // Initialize global StatAggregator + stat_aggregator = new StatAggregator(); } void System::GetExecutablePath(std::string& binary_path) { @@ -257,6 +266,7 @@ void CLHelper::Init(unsigned int platform_number, unsigned int device_number) { cl_program p_maximum = CreateProgram ( "kernels/maximumPooling.cl" ); cl_program p_amaximum = CreateProgram ( "kernels/advmaximumPooling.cl" ); cl_program p_nonLinearFunctions = CreateProgram ( "kernels/nonLinearFunctions.cl" ); + cl_program p_scaling = CreateProgram ( "kernels/scaling.cl" ); cl_program p_setValue = CreateProgram ( "kernels/setValue.cl" ); cl_program p_sms = CreateProgram ( "kernels/sms.cl" ); cl_program p_im2col = CreateProgram ( "kernels/im2col.cl" ); @@ -392,6 +402,18 @@ void CLHelper::Init(unsigned int platform_number, unsigned int device_number) { if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } + + k_up = clCreateKernel ( p_scaling, "UP", &error ); + + if ( error != CL_SUCCESS ) { + FATAL ( "Error creating kernel: " << ( signed int ) error ); + } + + k_down = clCreateKernel ( p_scaling, "DOWN", &error ); + + if ( error != CL_SUCCESS ) { + FATAL ( "Error creating kernel: " << ( signed int ) error ); + } #ifdef BUILD_CLBLAS cl_int err = clblasSetup(); if (err!=CL_SUCCESS) diff --git a/src/util/StatAggregator.cpp b/src/util/StatAggregator.cpp new file mode 100644 index 0000000..e2aed6e --- /dev/null +++ b/src/util/StatAggregator.cpp @@ -0,0 +1,153 @@ +/* + * This file is part of the CN24 semantic segmentation software, + * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com). + * + * For licensing information, see the LICENSE file included with this project.
diff --git a/src/util/StatAggregator.cpp b/src/util/StatAggregator.cpp
new file mode 100644
index 0000000..e2aed6e
--- /dev/null
+++ b/src/util/StatAggregator.cpp
@@ -0,0 +1,153 @@
+/*
+ * This file is part of the CN24 semantic segmentation software,
+ * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com).
+ *
+ * For licensing information, see the LICENSE file included with this project.
+ */
+
+#include "StatAggregator.h"
+#include "StatSink.h"
+
+#include <chrono>
+
+namespace Conv {
+
+unsigned int StatAggregator::RegisterSink(StatSink* stat_sink)
+{
+  stat_sinks_.push_back(stat_sink);
+  return stat_sink_count_++;
+}
+
+unsigned int StatAggregator::RegisterStat(StatDescriptor* stat_descriptor)
+{
+  stat_descriptors_.push_back(stat_descriptor);
+  stat_descriptor->stat_id = stat_descriptor_count_;
+  return stat_descriptor_count_++;
+}
+
+void StatAggregator::Initialize()
+{
+  if(state_ != INIT)
+    return;
+
+  // Initialize all StatSinks
+  for(unsigned int s = 0; s < stat_sink_count_; s++) {
+    stat_sinks_[s]->Initialize(stat_descriptors_);
+  }
+
+  // Initialize all statistics
+  for(unsigned int s = 0; s < stat_descriptor_count_; s++) {
+    Stat stat;
+    stats_.push_back(stat);
+  }
+  state_ = STOPPED;
+
+  // Reset statistics
+  Reset();
+
+  // Send SetCurrentExperiment "message" to all StatSinks at least once before processing
+  SetCurrentExperiment(hardcoded_stats_.current_experiment);
+}
+
+void StatAggregator::Generate()
+{
+  std::vector<Stat*> output_stats;
+
+  for(unsigned int s = 0; s < stat_descriptor_count_; s++) {
+    // We will not check for output_function's validity. We need its output.
+    Stat* output_stat = new Stat;
+    *output_stat = stat_descriptors_[s]->output_function(hardcoded_stats_, stats_[s]);
+    output_stats.push_back(output_stat);
+  }
+
+  // Call all StatSinks' Process method
+  for(unsigned int s = 0; s < stat_sink_count_; s++) {
+    stat_sinks_[s]->Process(hardcoded_stats_, output_stats);
+  }
+
+  // Free all the allocated memory
+  for(unsigned int s = 0; s < stat_descriptor_count_; s++) {
+    delete (output_stats[s]);
+  }
+}
+
+void StatAggregator::Update(unsigned int stat_id, double user_value)
+{
+  // Ignore this call if not recording
+  if(state_ != RECORDING)
+    return;
+
+  if(stat_id < stat_descriptor_count_) {
+    // We will not check for validity because we provided an initial function.
+    stat_descriptors_[stat_id]->update_function(stats_[stat_id], user_value);
+  }
+}
+
+void StatAggregator::Reset()
+{
+  // Ignore this call if recording
+  if(state_ != STOPPED)
+    return;
+
+  hardcoded_stats_.Reset();
+
+  // Reset non-hardcoded stats
+  for(unsigned int s = 0; s < stat_descriptor_count_; s++) {
+    // We will not check for validity because we provided an initial function.
+    stat_descriptors_[s]->init_function(stats_[s]);
+  }
+}
+
+void StatAggregator::StartRecording()
+{
+  // Ignore this call if already recording
+  if(state_ != STOPPED)
+    return;
+
+  // Record start time
+  start_time_ = std::chrono::system_clock::now();
+
+  state_ = RECORDING;
+}
+
+void StatAggregator::StopRecording()
+{
+  // Ignore this call if not recording
+  if(state_ != RECORDING)
+    return;
+
+  // Record stopping time
+  auto stop_time = std::chrono::system_clock::now();
+
+  // Update elapsed time
+  std::chrono::duration<double> t_diff = stop_time - start_time_;
+  hardcoded_stats_.seconds_elapsed += t_diff.count();
+
+  state_ = STOPPED;
+}
+
+void StatAggregator::Snapshot() {
+  // Ignore this call if not recording
+  if (state_ != RECORDING)
+    return;
+
+  StopRecording();
+  Generate();
+  Reset();
+  StartRecording();
+}
+
+void StatAggregator::SetCurrentExperiment(std::string current_experiment) {
+  // Only change the experiment name when not recording and already initialized
+  if(state_ != STOPPED)
+    return;
+
+  // Call all StatSinks' SetCurrentExperiment method
+  for(unsigned int s = 0; s < stat_sink_count_; s++) {
+    stat_sinks_[s]->SetCurrentExperiment(current_experiment);
+  }
+}
+
+}
\ No newline at end of file
diff --git a/src/util/Tensor.cpp b/src/util/Tensor.cpp
index 7203dbb..7e14f60 100644
--- a/src/util/Tensor.cpp
+++ b/src/util/Tensor.cpp
@@ -129,13 +129,14 @@ void Tensor::Shadow ( Tensor& tensor ) {
 
 void Tensor::Resize ( const std::size_t samples, const std::size_t width,
-                      const std::size_t height, const std::size_t maps, datum* const preallocated_memory, bool mmapped) {
+                      const std::size_t height, const std::size_t maps, datum* const preallocated_memory, bool mmapped, bool dont_delete) {
   // Check if reshaping works
-  if ( Reshape ( samples, width, height, maps ) )
+  if (preallocated_memory == nullptr && Reshape ( samples, width, height, maps ) )
     return;
 
-  // Delete the old allocation
-  DeleteIfPossible();
+  // Delete the old allocation if it is different from the new one
+  if(preallocated_memory != data_ptr_ && !dont_delete)
+    DeleteIfPossible();
 
   // Calculate memory requirement
   std::size_t elements = samples * maps * width * height;
@@ -393,17 +394,21 @@ bool Tensor::CopyMap ( const Tensor& source, const std::size_t source_sample,
 
 void Tensor::DeleteIfPossible() {
   if ( data_ptr_ != nullptr ) {
     if ( !is_shadow_ ) {
+#ifdef BUILD_POSIX
       if(mmapped_) {
        munmap((void*)original_mmap_, (elements_ * sizeof(datum)) / sizeof(char));
         original_mmap_ = nullptr;
         mmapped_ = false;
       } else {
+#endif
 #ifdef BLAS_MKL
         mkl_free ( data_ptr_ );
 #else
         delete[] data_ptr_;
 #endif
+#ifdef BUILD_POSIX
       }
+#endif
 #ifdef BUILD_OPENCL
       if ( cl_data_ptr_ != 0 ) {
         clReleaseMemObject ( (cl_mem)cl_data_ptr_ );
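StatAggregator is a small state machine (INIT -> STOPPED -> RECORDING): Update() only counts while recording, Reset() only clears while stopped, and Snapshot() stops, publishes to all sinks via Generate(), resets, and resumes. A registration sketch, assuming StatDescriptor exposes the init_function/update_function/output_function members used above and that Stat carries a double value; the exact field and type names are assumptions, not part of this patch:

    Conv::StatAggregator* agg = Conv::System::stat_aggregator;

    Conv::StatDescriptor desc;
    desc.init_function   = [](Conv::Stat& stat) { stat.value = 0.0; };
    desc.update_function = [](Conv::Stat& stat, double v) { stat.value += v; };
    desc.output_function = [](Conv::HardcodedStats& hc, Conv::Stat& stat) {
      Conv::Stat out = stat;
      // e.g. report a rate by dividing by the recorded wall time
      if (hc.seconds_elapsed > 0.0)
        out.value /= hc.seconds_elapsed;
      return out;
    };

    unsigned int stat_id = agg->RegisterStat(&desc);
    agg->Initialize();          // INIT -> STOPPED, notifies the sinks once
    agg->StartRecording();      // stamps start_time_
    agg->Update(stat_id, 0.5);  // ignored unless state_ == RECORDING
    agg->Snapshot();            // stop, Generate(), Reset(), resume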
diff --git a/src/util/TensorStream.cpp b/src/util/TensorStream.cpp
new file mode 100644
index 0000000..2977d27
--- /dev/null
+++ b/src/util/TensorStream.cpp
@@ -0,0 +1,47 @@
+/*
+ * This file is part of the CN24 semantic segmentation software,
+ * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com).
+ *
+ * For licensing information, see the LICENSE file included with this project.
+ */
+
+#include <fstream>
+#include <string>
+#include <cstdint>
+
+#ifdef BUILD_POSIX
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+#include "TensorStream.h"
+#include "FloatTensorStream.h"
+#include "CompressedTensorStream.h"
+
+namespace Conv {
+
+TensorStream* TensorStream::FromFile(std::string path) {
+  std::ifstream input_stream(path, std::ios::in | std::ios::binary);
+  if(!input_stream.good()) {
+    FATAL("Cannot open file: " << path);
+  }
+  uint64_t magic = 0;
+
+  input_stream.read((char*)&magic, sizeof(uint64_t)/sizeof(char));
+  input_stream.close();
+
+  if(magic == CN24_CTS_MAGIC) {
+    LOGDEBUG << "Is compressed tensor, loading...";
+    CompressedTensorStream* cts = new CompressedTensorStream();
+    cts->LoadFile(path);
+    return cts;
+  } else {
+    LOGDEBUG << "Is float tensor, loading...";
+    FloatTensorStream* fts = new FloatTensorStream();
+    fts->LoadFile(path);
+    return fts;
+  }
+}
+
+}
\ No newline at end of file
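FromFile() is the single entry point for both on-disk formats: it reads the first eight bytes and dispatches on CN24_CTS_MAGIC, so callers never need to know how a tensor stream is encoded. A usage sketch; the path is a placeholder:

    // The caller only sees the abstract TensorStream interface.
    Conv::TensorStream* stream = Conv::TensorStream::FromFile("dataset.Tensor");
    LOGINFO << stream->GetTensorCount() << " tensors in stream";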
diff --git a/src/util/TensorStreamDataset.cpp b/src/util/TensorStreamDataset.cpp
index 6227b50..377e9c4 100644
--- a/src/util/TensorStreamDataset.cpp
+++ b/src/util/TensorStreamDataset.cpp
@@ -23,16 +23,24 @@
 #include "KITTIData.h"
 #include "ConfigParsing.h"
 
+#include "FloatTensorStream.h"
+
 namespace Conv {
 
-TensorStreamDataset::TensorStreamDataset (std::istream& training_stream,
+TensorStreamDataset::TensorStreamDataset (
+  /*
+  std::istream& training_stream,
   std::istream& testing_stream,
+  */
+  TensorStream* training_stream,
+  TensorStream* testing_stream,
   unsigned int classes, std::vector< std::string > class_names,
   std::vector<unsigned int> class_colors, std::vector<datum> class_weights,
   dataset_localized_error_function error_function, int training_fd, int testing_fd ) :
+  training_stream_(training_stream), testing_stream_(testing_stream),
   classes_ (classes), class_names_ (class_names), class_colors_ (class_colors),
   class_weights_(class_weights), error_function_ (error_function) {
@@ -43,21 +51,11 @@ TensorStreamDataset::TensorStreamDataset (std::istream& training_stream,
     FATAL ("Class count does not match class information count!");
   }
 
+  // Count tensors
   Tensor tensor;
-  while (!training_stream.eof()) {
-    tensor.Deserialize (training_stream, true);
-
-    if (tensor.elements() == 0)
-      break;
-
-    // LOGDEBUG << "Tensor " << tensor_count_training_ << ": " << tensor;
-    tensor_count_training_++;
-
-    training_stream.peek();
-  }
-
+  tensor_count_training_ = training_stream_->GetTensorCount();
   LOGDEBUG << tensor_count_training_ / 2 << " training tensors";
 
   // We need alternating label and image tensors, so we need an even count
@@ -65,18 +63,7 @@ TensorStreamDataset::TensorStreamDataset (std::istream& training_stream,
     FATAL ("Odd training tensor count!");
   }
 
-  while (!testing_stream.eof()) {
-    tensor.Deserialize (testing_stream, true);
-
-    if (tensor.elements() == 0)
-      break;
-
-    // LOGDEBUG << "Tensor " << tensor_count_testing_ << ": " << tensor;
-    tensor_count_testing_++;
-
-    testing_stream.peek();
-  }
-
+  tensor_count_testing_ = testing_stream->GetTensorCount();
   LOGDEBUG << tensor_count_testing_ / 2 << " testing tensors";
 
   if (tensor_count_testing_ & 1) {
@@ -85,56 +72,29 @@ TensorStreamDataset::TensorStreamDataset (std::istream& training_stream,
 
   tensors_ = (tensor_count_testing_ + tensor_count_training_) / 2;
 
-  // Reset streams
-  training_stream.clear();
-  testing_stream.clear();
-  training_stream.seekg (0, std::ios::beg);
-  testing_stream.seekg (0, std::ios::beg);
-
-  // Allocate arrays that depend on the tensor count
-  if (tensors_ > 0) {
-    data_ = new Tensor[tensors_];
-    labels_ = new Tensor[tensors_];
-  } else {
-    data_ = new Tensor[1];
-    labels_ = new Tensor[1];
-  }
-
   // Read tensors
   unsigned int e = 0;
   max_width_ = 0;
   max_height_ = 0;
 
   if((tensor_count_training_ + tensor_count_testing_) > 0) {
-    LOGINFO << "Deserializing " << (tensor_count_training_ + tensor_count_testing_) / 2 << " Tensors..." << std::endl << std::flush;
+    LOGINFO << "Loaded " << (tensor_count_training_ + tensor_count_testing_) / 2 << " Tensors.";
   }
 
   for (unsigned int t = 0; t < (tensor_count_training_ / 2); t++) {
-    data_[t].Deserialize (training_stream, false, true, training_fd);
-
-    if (data_[t].width() > max_width_)
-      max_width_ = data_[t].width();
-
-    if (data_[t].height() > max_height_)
-      max_height_ = data_[t].height();
-
-    labels_[t].Deserialize (training_stream, false, true, training_fd);
+    if(training_stream_->GetWidth(2*t) > max_width_)
+      max_width_ = training_stream_->GetWidth(2*t);
 
-    std::cout << "." << std::flush;
+    if(training_stream_->GetHeight(2*t) > max_height_)
+      max_height_ = training_stream_->GetHeight(2*t);
   }
 
   for (unsigned int t = (tensor_count_training_ / 2) ; t < tensors_; t++) {
-    data_[t].Deserialize (testing_stream, false, true, testing_fd);
-
-    if (data_[t].width() > max_width_)
-      max_width_ = data_[t].width();
-
-    if (data_[t].height() > max_height_)
-      max_height_ = data_[t].height();
-
-    labels_[t].Deserialize (testing_stream, false, true, testing_fd);
+    if(testing_stream_->GetWidth(2*t) > max_width_)
+      max_width_ = testing_stream_->GetWidth(2*t);
 
-    std::cout << "." << std::flush;
+    if(testing_stream_->GetHeight(2*t) > max_height_)
+      max_height_ = testing_stream_->GetHeight(2*t);
   }
 
   if (max_width_ & 1)
@@ -167,8 +127,13 @@ TensorStreamDataset::TensorStreamDataset (std::istream& training_stream,
   if (max_height_ & 32)
     max_height_+=32;
 
-  input_maps_ = data_[0].maps();
-  label_maps_ = labels_[0].maps();
+  if(training_stream_->GetTensorCount() > 0) {
+    input_maps_ = training_stream_->GetMaps(0);
+    label_maps_ = training_stream_->GetMaps(1);
+  } else {
+    input_maps_ = testing_stream_->GetMaps(0);
+    label_maps_ = testing_stream_->GetMaps(1);
+  }
 
   // Prepare error cache
   error_cache.Resize (1, max_width_, max_height_, 1);
@@ -233,37 +198,41 @@ bool TensorStreamDataset::SupportsTesting() const {
 
 bool TensorStreamDataset::GetTrainingSample (Tensor& data_tensor, Tensor& label_tensor, Tensor& helper_tensor, Tensor& weight_tensor, unsigned int sample, unsigned int index) {
   if (index < tensor_count_training_ / 2) {
     bool success = true;
-    success &= Tensor::CopySample (data_[index], 0, data_tensor, sample);
-    success &= Tensor::CopySample (labels_[index], 0, label_tensor, sample);
+    success &= training_stream_->CopySample(2 * index, 0, data_tensor, sample);
+    success &= training_stream_->CopySample(2 * index + 1, 0, label_tensor, sample);
+
+    unsigned int data_width = training_stream_->GetWidth(2 * index);
+    unsigned int data_height = training_stream_->GetHeight(2 * index);
 
     // Write spatial prior data to helper tensor
-    for (unsigned int y = 0; y < data_[index].height(); y++) {
-      for (unsigned int x = 0; x < data_[index].width(); x++) {
-        *helper_tensor.data_ptr(x, y, 0, sample) = ((datum)x) / ((datum)data_[index].width() - 1);
-        *helper_tensor.data_ptr(x, y, 1, sample) = ((datum)y) / ((datum)data_[index].height() - 1);
+    for (unsigned int y = 0; y < data_height; y++) {
+      for (unsigned int x = 0; x < data_width; x++) {
+        *helper_tensor.data_ptr(x, y, 0, sample) = ((datum)x) / ((datum)data_width - 1);
+        *helper_tensor.data_ptr(x, y, 1, sample) = ((datum)y) / ((datum)data_height - 1);
       }
-      for (unsigned int x = data_[index].width(); x < GetWidth(); x++) {
+      for (unsigned int x = data_width; x < GetWidth(); x++) {
         *helper_tensor.data_ptr(x, y, 0, sample) = 0;
         *helper_tensor.data_ptr(x, y, 1, sample) = 0;
       }
     }
-    for (unsigned int y = data_[index].height(); y < GetHeight(); y++) {
+    for (unsigned int y = data_height; y < GetHeight(); y++) {
       for (unsigned int x = 0; x < GetWidth(); x++) {
         *helper_tensor.data_ptr(x, y, 0, sample) = 0;
         *helper_tensor.data_ptr(x, y, 1, sample) = 0;
       }
     }
 
-    //if (data_[index].width() == GetWidth() && data_[index].height() == GetHeight()) {
+    //if (data_width == GetWidth() && data_height == GetHeight()) {
     //  success &= Tensor::CopySample (error_cache, 0, weight_tensor, sample);
     //} else {
     // Reevaluate error function
     weight_tensor.Clear (0.0, sample);
 
-    for (unsigned int y = 0; y < data_[index].height(); y++) {
-      for (unsigned int x = 0; x < data_[index].width(); x++) {
+    #pragma omp parallel for default(shared)
+    for (unsigned int y = 0; y < data_height; y++) {
+      for (unsigned int x = 0; x < data_width; x++) {
         const datum class_weight = class_weights_[label_tensor.PixelMaximum(x, y, sample)];
-        *weight_tensor.data_ptr (x, y, 0, sample) = error_function_ (x, y, data_[index].width(), data_[index].height()) * class_weight;
+        *weight_tensor.data_ptr (x, y, 0, sample) = error_function_ (x, y, data_width, data_height) * class_weight;
       }
     }
     //}
@@ -275,38 +244,41 @@ bool TensorStreamDataset::GetTrainingSample (Tensor& data_tensor, Tensor& label_
 bool TensorStreamDataset::GetTestingSample (Tensor& data_tensor, Tensor& label_tensor, Tensor& helper_tensor, Tensor& weight_tensor, unsigned int sample, unsigned int index) {
   if (index < tensor_count_testing_ / 2) {
     bool success = true;
-    unsigned int test_index = (tensor_count_training_ / 2) + index;
-    success &= Tensor::CopySample (data_[test_index], 0, data_tensor, sample);
-    success &= Tensor::CopySample (labels_[test_index], 0, label_tensor, sample);
+    success &= testing_stream_->CopySample(2 * index, 0, data_tensor, sample);
+    success &= testing_stream_->CopySample(2 * index + 1, 0, label_tensor, sample);
+
+    unsigned int data_width = testing_stream_->GetWidth(2 * index);
+    unsigned int data_height = testing_stream_->GetHeight(2 * index);
 
     // Write spatial prior data to helper tensor
-    for (unsigned int y = 0; y < data_[test_index].height(); y++) {
-      for (unsigned int x = 0; x < data_[test_index].width(); x++) {
-        *helper_tensor.data_ptr(x, y, 0, sample) = ((datum)x) / ((datum)data_[test_index].width() - 1);
-        *helper_tensor.data_ptr(x, y, 1, sample) = ((datum)y) / ((datum)data_[test_index].height() - 1);
+    for (unsigned int y = 0; y < data_height; y++) {
+      for (unsigned int x = 0; x < data_width; x++) {
+        *helper_tensor.data_ptr(x, y, 0, sample) = ((datum)x) / ((datum)data_width - 1);
+        *helper_tensor.data_ptr(x, y, 1, sample) = ((datum)y) / ((datum)data_height - 1);
       }
-      for (unsigned int x = data_[test_index].width(); x < GetWidth(); x++) {
+      for (unsigned int x = data_width; x < GetWidth(); x++) {
        *helper_tensor.data_ptr(x, y, 0, sample) = 0;
         *helper_tensor.data_ptr(x, y, 1, sample) = 0;
       }
     }
-    for (unsigned int y = data_[test_index].height(); y < GetHeight(); y++) {
+    for (unsigned int y = data_height; y < GetHeight(); y++) {
       for (unsigned int x = 0; x < GetWidth(); x++) {
         *helper_tensor.data_ptr(x, y, 0, sample) = 0;
         *helper_tensor.data_ptr(x, y, 1, sample) = 0;
       }
     }
 
-    //if (data_[test_index].width() == GetWidth() && data_[test_index].height() == GetHeight()) {
+    //if (data_width == GetWidth() && data_height == GetHeight()) {
    //  success &= Tensor::CopySample (error_cache, 0, weight_tensor, sample);
     //} else {
     // Reevaluate error function
     weight_tensor.Clear (0.0, sample);
 
-    for (unsigned int y = 0; y < data_[test_index].height(); y++) {
-      for (unsigned int x = 0; x < data_[test_index].width(); x++) {
+    #pragma omp parallel for default(shared)
+    for (unsigned int y = 0; y < data_height; y++) {
+      for (unsigned int x = 0; x < data_width; x++) {
         const datum class_weight = class_weights_[label_tensor.PixelMaximum(x, y, sample)];
-        *weight_tensor.data_ptr (x, y, 0, sample) = error_function_ (x, y, data_[test_index].width(), data_[test_index].height()) * class_weight;
+        *weight_tensor.data_ptr (x, y, 0, sample) = error_function_ (x, y, data_width, data_height) * class_weight;
       }
     }
     //}
@@ -326,6 +298,9 @@ TensorStreamDataset* TensorStreamDataset::CreateFromConfiguration (std::istream&
   int training_fd = 0;
   int testing_fd = 0;
   bool no_mmap = false;
+
+  TensorStream* training_stream = new FloatTensorStream();
+  TensorStream* testing_stream = new FloatTensorStream();
 
   file.clear();
   file.seekg (0, std::ios::beg);
@@ -403,39 +378,14 @@ TensorStreamDataset* TensorStreamDataset::CreateFromConfiguration (std::istream&
   LOGDEBUG << "Training tensor: " << training_file;
   LOGDEBUG << "Testing tensor: " << testing_file;
 
-  std::istream* training_stream = nullptr;
-  std::istream* testing_stream = nullptr;
-
   if (!dont_load && (selection == LOAD_BOTH || selection == LOAD_TRAINING_ONLY) && training_file.length() > 0) {
-    training_stream = new std::ifstream (training_file, std::ios::in | std::ios::binary);
-    if(!training_stream->good()) {
-      FATAL("Failed to load " << training_file << "!");
-    }
-#ifdef BUILD_POSIX
-    if(!no_mmap)
-      training_fd = open(training_file.c_str(), O_RDONLY);
-    if(training_fd < 0) {
-      FATAL("Failed to load " << training_file << "!");
-    }
-#endif
+    training_stream = TensorStream::FromFile(training_file);
   } else {
-    training_stream = new std::istringstream();
   }
 
   if (!dont_load && (selection == LOAD_BOTH || selection == LOAD_TESTING_ONLY) && testing_file.length() > 0) {
-    testing_stream = new std::ifstream (testing_file, std::ios::in | std::ios::binary);
-    if(!testing_stream->good()) {
-      FATAL("Failed to load " << testing_file << "!");
-    }
-#ifdef BUILD_POSIX
-    if(!no_mmap)
-      testing_fd = open(training_file.c_str(), O_RDONLY);
-    if(testing_fd < 0) {
-      FATAL("Failed to load " << testing_file << "!");
-    }
-#endif
+    testing_stream = TensorStream::FromFile(testing_file);
   } else {
-    testing_stream = new std::istringstream();
   }
 
   if (class_weights.size() != classes) {
@@ -443,7 +393,7 @@ TensorStreamDataset* TensorStreamDataset::CreateFromConfiguration (std::istream&
     class_weights.push_back(1.0);
   }
 
-  return new TensorStreamDataset (*training_stream, *testing_stream, classes,
+  return new TensorStreamDataset (training_stream, testing_stream, classes,
     class_names, class_colors, class_weights, error_function, training_fd, testing_fd);
 }
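The compressTensorStream tool below converts a float tensor stream into the RLE-compressed format and verifies the round trip as it goes. The underlying invariant is that Compress()/Decompress() are lossless inverses; a sketch of that round trip, with the tensor dimensions chosen arbitrarily:

    Conv::Tensor tensor(1, 64, 64, 3);   // dimensions are placeholders
    tensor.Clear(0.0, 0);                // constant data is the best case for RLE

    Conv::CompressedTensor ctensor;
    ctensor.Compress(tensor);            // encode
    ctensor.Decompress(tensor);          // decode back in place
    // tensor.elements() is unchanged; the byte counts before and after must
    // match exactly, which is the same check the tool performs per tensor.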
diff --git a/tools/compressTensorStream.cpp b/tools/compressTensorStream.cpp
new file mode 100644
index 0000000..fc02079
--- /dev/null
+++ b/tools/compressTensorStream.cpp
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the CN24 semantic segmentation software,
+ * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com).
+ *
+ * For licensing information, see the LICENSE file included with this project.
+ */
+
+#include <cn24.h>
+
+#include <fstream>
+#include <cstdint>
+
+int main(int argc, char** argv) {
+  Conv::System::Init();
+
+  if(argc != 3) {
+    LOGERROR << "USAGE: " << argv[0] << " <input> <output>";
+  }
+
+  std::string input_file_name(argv[1]);
+  std::string output_file_name(argv[2]);
+
+  std::ifstream input_tensor_stream(input_file_name, std::ios::in | std::ios::binary);
+  std::ofstream output_tensor_stream(output_file_name, std::ios::out | std::ios::binary);
+
+  if(!input_tensor_stream.good())
+    FATAL("Cannot open " << input_file_name);
+
+  if(!output_tensor_stream.good())
+    FATAL("Cannot open " << output_file_name);
+
+  long uncompressed_total = 0;
+  long compressed_total = 0;
+
+  Conv::Tensor tensor;
+
+  uint64_t magic = CN24_CTS_MAGIC;
+  output_tensor_stream.write((char*)&magic, sizeof(uint64_t)/sizeof(char));
+
+  while(!input_tensor_stream.eof()) {
+    tensor.Deserialize(input_tensor_stream);
+
+    LOGDEBUG << "Input tensor: " << tensor;
+
+    unsigned int original_size = tensor.elements() * sizeof(Conv::datum)/sizeof(char);
+    LOGDEBUG << "Size: " << original_size;
+
+    Conv::CompressedTensor ctensor;
+    ctensor.Compress(tensor);
+
+    ctensor.Serialize(output_tensor_stream);
+
+    LOGDEBUG << "RLE Size: " << ctensor.compressed_length();
+
+    ctensor.Decompress(tensor);
+    unsigned int bytes_out = tensor.elements() * sizeof(Conv::datum)/sizeof(char);
+
+    if(bytes_out != original_size) {
+      FATAL("Size mismatch! Expected: " << original_size << ", actual: " << bytes_out);
+    }
+
+    LOGINFO << "Ratio: " << 100.0 * (double)ctensor.compressed_length() / (double)original_size << "%" << std::flush;
+    compressed_total += ctensor.compressed_length();
+    uncompressed_total += tensor.elements() * sizeof(Conv::datum)/sizeof(char);
+
+    input_tensor_stream.peek();
+  }
+  LOGINFO << "Overall ratio: " << 100.0 * (double)compressed_total / (double)uncompressed_total << "%";
+  LOGINFO << "Uncompressed: " << uncompressed_total;
+  LOGINFO << "Compressed  : " << compressed_total;
+  LOGEND;
+}
\ No newline at end of file
diff --git a/tools/makeCompressedTensorStream.cpp b/tools/makeCompressedTensorStream.cpp
new file mode 100644
index 0000000..1f9d597
--- /dev/null
+++ b/tools/makeCompressedTensorStream.cpp
@@ -0,0 +1,195 @@
+/*
+ * This file is part of the CN24 semantic segmentation software,
+ * copyright (C) 2015 Clemens-Alexander Brust (ikosa dot de at gmail dot com).
+ *
+ * For licensing information, see the LICENSE file included with this project.
+ */
+/**
+ * @file makeCompressedTensorStream.cpp
+ * @brief Tool to import datasets
+ *
+ * @author Clemens-Alexander Brust (ikosa dot de at gmail dot com)
+ */
+
+#include <fstream>
+#include <string>
+
+#include <cn24.h>
+
+int main ( int argc, char** argv ) {
+  if ( argc < 8 ) {
+    LOGERROR << "USAGE: " << argv[0] << "