intel
diff --git a/‎Makefile
Lines changed: 7 additions & 1 deletion b/‎Makefile
Lines changed: 7 additions & 1 deletion
diff --git a/‎Makefile.config.example
Lines changed: 3 additions & 0 deletions b/‎Makefile.config.example
Lines changed: 3 additions & 0 deletions
diff --git a/‎Makefile.mkldnn
Lines changed: 4 additions & 4 deletions b/‎Makefile.mkldnn
Lines changed: 4 additions & 4 deletions
diff --git a/‎cmake/Dependencies.cmake
Lines changed: 1 addition & 1 deletion b/‎cmake/Dependencies.cmake
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/cpp_classification/batch_classification.cpp
Lines changed: 4 additions & 0 deletions b/‎examples/cpp_classification/batch_classification.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/‎examples/cpp_classification/classification.cpp
Lines changed: 4 additions & 0 deletions b/‎examples/cpp_classification/classification.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/‎examples/pycaffe/tune_model.py
Lines changed: 1 addition & 1 deletion b/‎examples/pycaffe/tune_model.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/caffe/blob.hpp
Lines changed: 3 additions & 3 deletions b/‎include/caffe/blob.hpp
Lines changed: 3 additions & 3 deletions
diff --git a/‎include/caffe/layer.hpp
Lines changed: 9 additions & 4 deletions b/‎include/caffe/layer.hpp
Lines changed: 9 additions & 4 deletions
diff --git a/‎include/caffe/layers/batch_norm_layer.hpp
Lines changed: 8 additions & 0 deletions b/‎include/caffe/layers/batch_norm_layer.hpp
Lines changed: 8 additions & 0 deletions
@@ -80,7 +80,7 @@ ifeq ($(CAFFE_MLSL_SHUFFLE), 1)
 	COMMON_FLAGS += -DCAFFE_MLSL_SHUFFLE
 endif
 
-ifeq ($(FW_OVERLAP_OPT), 1)
+ifneq ($(FW_OVERLAP_OPT), 0)
 	COMMON_FLAGS += -DFW_OVERLAP_OPT
 endif
 endif
@@ -547,6 +547,12 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR)
 # Automatic dependency generation (nvcc is handled separately)
 CXXFLAGS += -MMD -MP
 
+##########SGD FUSION#######################
+ifeq ($(ENABLE_SGD_FUSION), 1)
+        COMMON_FLAGS += -DENABLE_SGD_FUSION
+endif
+###########################################
+#
 # Complete build flags.
 COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CXXFLAGS += -std=c++11 -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
 
@@ -170,5 +170,8 @@ DISTRIBUTE_DIR := distribute
 # The ID of the GPU that 'make runtest' will use to run unit tests.
 TEST_GPUID := 0
 
+# Uncomment to enable SGD fusion
+# ENABLE_SGD_FUSION := 1
+
 # enable pretty build (comment to see full commands)
 Q ?= @
@@ -1,5 +1,5 @@
 CAFFE_ROOTDIR := $(shell pwd)
-MKLDNN_ROOTDIR := external/mkldnn
+MKLDNN_ROOTDIR := $(CAFFE_ROOTDIR)/external/mkldnn
 MKLDNN_TMPDIR := $(MKLDNN_ROOTDIR)/tmp
 MKLDNN_SRCDIR := $(MKLDNN_ROOTDIR)/src
 MKLDNN_BUILDDIR := $(MKLDNN_ROOTDIR)/build
@@ -22,7 +22,7 @@ ifneq (,$(findstring ccache,$(CC)))
 endif
 
 MKLDNN_GITHUB := https://github.com/01org/mkl-dnn.git
-MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(CAFFE_ROOTDIR)/$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
+MKLDNN_CMAKE_FLAGS += $(MKLDNN_SRCDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNN_INSTALLDIR) -DMKLROOT=${MKL_ROOTDIR} -B$(MKLDNN_BUILDDIR) -DCMAKE_CXX_COMPILER="$(MKLDNN_CXX)" -DCMAKE_C_COMPILER="$(MKLDNN_CC)"
 
 ifeq ("$(wildcard $(MKLDNN_INSTALLDIR)/include/mkldnn.hpp)", "")
 mkldnn_download:
@@ -32,8 +32,8 @@ mkldnn_download:
 
 mkldnn_build: mkldnn_download
 	cmake $(MKLDNN_CMAKE_FLAGS)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
-	make -C $(CAFFE_ROOTDIR)/$(MKLDNN_BUILDDIR) install
+	make -C $(MKLDNN_BUILDDIR) -j$(shell cat /proc/cpuinfo |grep 'processor'|wc -l)
+	make -C $(MKLDNN_BUILDDIR) install
 else
 mkldnn_download:
 mkldnn_build:
 
@@ -122,7 +122,7 @@ if(USE_MLSL)
   if(CAFFE_MLSL_SHUFFLE)
     add_definitions("-DCAFFE_MLSL_SHUFFLE")
   endif()
-  if(FW_OVERLAP_OPT)
+  if(FW_OVERLAP_OPT OR NOT DEFINED FW_OVERLAP_OPT)
     message(STATUS "Forward overlapping optimization is enabled!")
     add_definitions("-DFW_OVERLAP_OPT")
   endif()
 
@@ -422,6 +422,10 @@ int main(int argc, char** argv) {
         cout<<"Use mean file: "<<FLAGS_mean_file<<endl;
     }
 
+#ifdef USE_MLSL
+    caffe::mn::init(&argc,&argv);
+#endif
+
     Classifier classifier(FLAGS_model, FLAGS_weights, FLAGS_mean_file,
             FLAGS_mean_value, FLAGS_label_file, FLAGS_engine, FLAGS_batch_size);
 
 
@@ -285,6 +285,10 @@ int main(int argc, char** argv) {
     engine = argv[6];
   }
 
+#ifdef USE_MLSL
+  caffe::mn::init(&argc,&argv);
+#endif
+
   Classifier classifier(model_file, trained_file, mean_file, label_file, engine);
 
 
 
@@ -23,7 +23,7 @@ def tuneModelDefinition(model_path, iteration):
     caffe_path = os.path.join(working_dir, "..", "..", "build", "tools", "caffe")
     if not os.path.exists(caffe_path):
         print "Caffe binary does not exist; please build Caffe binary first."
-        sys,exit(1)
+        sys.exit(1)
 
     base_model_name = os.path.basename(model_path)
     model_dir = os.path.dirname(model_path)
 
@@ -109,7 +109,7 @@ class Blob {
     return shape_[CanonicalAxisIndex(index)];
   }
   inline int num_axes() const { return shape_.size(); }
-  inline int count() const { return count_; }
+  inline long count() const { return count_; }
 
   /**
    * @brief Compute the volume of a slice; i.e., the product of dimensions
@@ -332,8 +332,8 @@ class Blob {
   shared_ptr<SyncedMemory> shape_data_;
 #endif
   vector<int> shape_;
-  int count_;
-  int capacity_;
+  long count_;
+  long capacity_;
 
   DISABLE_COPY_AND_ASSIGN(Blob);
 };  // class Blob
 
@@ -55,8 +55,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LOG_BLOB(layer, blob, part, blob_id, description)              \
   do                                                                   \
   {                                                                    \
-      int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count());    \
-      for (int idx = 0; idx < elems_to_log; idx++)                     \
+      long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count());    \
+      for (long idx = 0; idx < elems_to_log; idx++)                     \
       {                                                                \
           LOG_LAYER(layer) << description                              \
                            << ", blob_id " << blob_id                  \
@@ -68,8 +68,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LOG_PARAM_BLOB(blob, part, blob_id, description)               \
   do                                                                   \
   {                                                                    \
-      int elems_to_log = std::min(MAX_ELEMS_TO_LOG, blob->count());    \
-      for (int idx = 0; idx < elems_to_log; idx++)                     \
+      long elems_to_log = std::min(static_cast<long>(MAX_ELEMS_TO_LOG), blob->count());    \
+      for (long idx = 0; idx < elems_to_log; idx++)                     \
       {                                                                \
           DLOG(INFO) << description                                    \
                      << ", blob_id " << blob_id                        \
@@ -521,7 +521,12 @@ class Layer {
       CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
           "unspecified or specified once per top blob.";
       for (int top_id = 0; top_id < top.size(); ++top_id) {
+#ifdef USE_MLSL
+        const Dtype loss_weight = layer_param_.loss_weight(top_id) /
+          GetDistribution().get_data_parts();
+#else
         const Dtype loss_weight = layer_param_.loss_weight(top_id);
+#endif
         if (loss_weight == Dtype(0)) { continue; }
         this->set_loss(top_id, loss_weight);
         const int count = top[top_id]->count();
 
@@ -117,11 +117,19 @@ class BatchNormLayer : public Layer<Dtype> {
                        const Dtype* data_to_be_replicated,
                        FuncTy op_func);
 
+  void ForwardStatsBatch_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top, int stats_batch_idx);
+  void BackwardStatsBatch_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom,
+      int stats_batch_idx);
+
   Blob<Dtype> mean_, variance_, temp_, x_norm_;
   bool use_global_stats_;
   Dtype moving_average_fraction_;
   int channels_;
   Dtype eps_;
+  int num_stats_batches_;
+  int stats_batch_size_;
 
   // extra temporary variables are used to carry out sums/broadcasting
   // using BLAS
Original file line number	Diff line number	Diff line change
`@@ -422,6 +422,10 @@ int main(int argc, char** argv) {`
`422`	`422`	`cout<<"Use mean file: "<<FLAGS_mean_file<<endl;`
`423`	`423`	`}`
`424`	`424`
	`425`	`+#ifdef USE_MLSL`
	`426`	`+ caffe::mn::init(&argc,&argv);`
	`427`	`+#endif`
	`428`	`+`
`425`	`429`	`Classifier classifier(FLAGS_model, FLAGS_weights, FLAGS_mean_file,`
`426`	`430`	`FLAGS_mean_value, FLAGS_label_file, FLAGS_engine, FLAGS_batch_size);`
`427`	`431`
Original file line number	Diff line number	Diff line change
`@@ -285,6 +285,10 @@ int main(int argc, char** argv) {`
`285`	`285`	`engine = argv[6];`
`286`	`286`	`}`
`287`	`287`
	`288`	`+#ifdef USE_MLSL`
	`289`	`+ caffe::mn::init(&argc,&argv);`
	`290`	`+#endif`
	`291`	`+`
`288`	`292`	`Classifier classifier(model_file, trained_file, mean_file, label_file, engine);`
`289`	`293`
`290`	`294`