Initial commit

2024-12-10 08:56:11 -08:00 · 2024-12-10 08:56:11 -08:00 · 9fdd561586
commit 9fdd561586
246 changed files with 58283 additions and 0 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1 @@
 External contributions are not accepted, sorry!
--- a/9
+++ b/9
@ -0,0 +1,9 @@
 Copyright 2023 Google LLC.
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/28
+++ b/28
@ -0,0 +1,28 @@
 # Builds the $(MODELNAME).so shared object from the CUDA and C++ sources.
 # PYTHON_INCLUDE_PATH, NUMPY_INCLUDE_PATH, ATLAS_LIB_PATH and
 # CUDA_INSTALL_PATH are not set here and must come from the environment
 # or the make command line.
 # MODELNAME is also handed to the compiler as -DMODELNAME and, prefixed
 # with "init", as -DINITNAME (see COMMONFLAGS).
 MODELNAME := _ConvNet
 # Header search paths.
 INCLUDES :=  -I$(PYTHON_INCLUDE_PATH) -I$(NUMPY_INCLUDE_PATH) -I./include -I./include/common -I./include/cudaconv2 -I./include/nvmatrix
 # Linker flags; -lpython<major.minor> is appended below.
 LIB := -lpthread -L$(ATLAS_LIB_PATH) -L$(CUDA_INSTALL_PATH)/lib64 -lcblas
 USECUBLAS   := 1
 # <major>.<minor> of whichever "python" is first on PATH, parsed from `python -V`.
 PYTHON_VERSION=$(shell python -V 2>&1 | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
 LIB += -lpython$(PYTHON_VERSION)
 # Code generation target: compute capability 2.0.
 GENCODE_ARCH := -gencode=arch=compute_20,code=\"sm_20,compute_20\"
 COMMONFLAGS := -DNUMPY_INTERFACE -DMODELNAME=$(MODELNAME) -DINITNAME=init$(MODELNAME)
 EXECUTABLE	:= $(MODELNAME).so
 # Source and header lists are produced by shell globbing.
 CUFILES				:= $(shell echo src/*.cu src/cudaconv2/*.cu src/nvmatrix/*.cu)
 CU_DEPS				:= $(shell echo include/*.cuh include/cudaconv2/*.cuh include/nvmatrix/*.cuh)
 CCFILES				:= $(shell echo src/common/*.cpp)
 C_DEPS				:= $(shell echo include/common/*.h)
 # NOTE(review): the included file presumably supplies the compile/link rules
 # and defines VERBOSE, LIBDIR, OBJDIR and TARGETDIR used below - confirm.
 include common-gcc-cuda-4.0.mk
 # Creates the library, object and target directories used by the build.
 makedirectories:
 	$(VERBOSE)mkdir -p $(LIBDIR)
 	$(VERBOSE)mkdir -p $(OBJDIR)/src/cudaconv2
 	$(VERBOSE)mkdir -p $(OBJDIR)/src/nvmatrix
 	$(VERBOSE)mkdir -p $(OBJDIR)/src/common
 	$(VERBOSE)mkdir -p $(TARGETDIR)
--- a/README.md
+++ b/README.md
@ -0,0 +1,9 @@
 # AlexNet
 This package contains the original AlexNet code.
 Krizhevsky, A., Sutskever, I. & Hinton, G. E. (2012).
 ImageNet Classification with Deep Convolutional Neural Networks.
 In F. Pereira, C. J. C. Burges, L. Bottou & K. Q. Weinberger (ed.),
 Advances in Neural Information Processing Systems 25 (pp. 1097--1105).
 Curran Associates, Inc.
--- a/SdkMasterLog.csv
+++ b/SdkMasterLog.csv
@ -0,0 +1,2 @@
 deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 4, Device = Tesla S2050, Device = Tesla S2050
 deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 4, Device = Tesla S2050, Device = Tesla S2050
--- a/avg-test.py
+++ b/avg-test.py
@ -0,0 +1,23 @@
 from util import *
 import os
 import sys
 import re
 import random as r
 import os
 def do_avg(paths, tgtpath, coeffs):
    """Write the weighted sum of several models' test predictions.

    File names are taken from paths[0]; only names that start with
    'test_preds_<batch number>' are processed.  For each of them the file
    of the same name is unpickled from every directory in paths, the
    'data' entries are combined as sum(coeff * data), and the result is
    pickled to <tgtpath>/test_preds_<batch number> as {'data': combined}.

    paths   -- list of directories holding one model's predictions each
    tgtpath -- existing directory the combined predictions are written to
    coeffs  -- weights, paired with paths by zip() (extra entries on
               either side are ignored)
    """
    for fname in sorted(os.listdir(paths[0])):
        m = re.match(r'test_preds_(\d+)', fname)
        if m is None:
            # Not a prediction file; calling .group() on the failed match
            # used to abort the whole run with an AttributeError.
            continue
        b = int(m.group(1))
        dics = [unpickle(os.path.join(p, fname)) for p in paths]
        preds = sum(c * d['data'] for c,d in zip(coeffs, dics))
        pickle(os.path.join(tgtpath, 'test_preds_%d' % b), {'data': preds})
        # Parenthesised single argument: prints identically under Python 2.
        print("Wrote batch %d" % b)
 if __name__ == "__main__":
    # Arguments: <comma-separated prediction dirs> <output dir> <comma-separated weights>
    paths = sys.argv[1].split(',')
    tgtpath = sys.argv[2]
    # The output directory is created before the weights are parsed.
    if not os.path.exists(tgtpath):
        os.makedirs(tgtpath)
    coeffs = []
    for x in sys.argv[3].split(','):
        coeffs.append(float(x))
    do_avg(paths, tgtpath, coeffs)
--- a/avg-valid.py
+++ b/avg-valid.py
@ -0,0 +1,108 @@
 from util import *
 import os
 import sys
 import re
 import random as r
 import numpy.random as nr
 from math import sqrt
 # Directory from which do_avg() reads 'batches.meta' and the
 # 'data_batch_<n>' validation batches.  The commented-out assignment is
 # an alternative location.
 #VALID_PATH = '/ais/gobi3/u/kriz/lsvrc-2012'
 VALID_PATH = '/storage/lsvrc-2012'
 def compute_top5(preds, labels):
    """Count the cases whose true label is not among the five best scores.

    preds  -- 2-D array indexed [case, class]; it is left unmodified
    labels -- indexable giving the true class index of every case

    Returns the number of such cases as an int.  Ties between equal scores
    go to the lower class index.

    The earlier implementation masked each found maximum by writing -1
    into preds, which clobbered the caller's array and miscounted whenever
    scores could be <= -1.
    """
    errs = 0
    for c, scores in enumerate(preds):
        # A stable sort of the negated row lists classes by descending
        # score with ties in index order.
        top5 = (-scores).argsort(kind='mergesort')[:5]
        errs += labels[c] not in top5
    return errs
 def do_avg(paths, coeffs, top5=False):
    """Return the validation error rates of a weighted sum of predictions.

    File names are taken from paths[0]; only names that start with
    'test_preds_<batch number>' are used.  For each of them the 'data'
    entries unpickled from every directory in paths are combined as
    sum(coeff * data) and compared with the labels of
    VALID_PATH/data_batch_<batch number>, shifted by the 'label_offset'
    stored in VALID_PATH/batches.meta.

    paths  -- list of directories holding one model's predictions each
    coeffs -- weights, paired with paths by zip()
    top5   -- also compute the top-5 error when true

    Returns (top-1 error rate, top-5 error rate); the second value is 0.0
    when top5 is false.  Raises ZeroDivisionError when no cases were read.
    """
    off = unpickle(os.path.join(VALID_PATH, 'batches.meta'))['label_offset']
    errs1, errs5, cases = 0, 0, 0
    for fname in sorted(os.listdir(paths[0])):
        m = re.match(r'test_preds_(\d+)', fname)
        if m is None:
            # Not a prediction file; .group() on the failed match used to
            # abort with an AttributeError.
            continue
        b = int(m.group(1))
        dics = [unpickle(os.path.join(p, fname)) for p in paths]
        dicv = unpickle(os.path.join(VALID_PATH, 'data_batch_%d' % b))
        # NOTE(review): assumes dicv[2] is a sequence whose items carry the
        # raw label at index 1 - confirm against the batch writer.
        labels = n.array([d[1]+off for d in dicv[2]])
        # min() must be called: the bare bound method was being compared
        # with 0, so the lower bound was never actually checked.
        assert labels.min() >= 0 and labels.max() < 1000
        preds = sum(c * d['data'] for c,d in zip(coeffs, dics))
        assert preds.shape[1] == 1000
        err1 = sum(preds.argmax(1) != labels)
        err5 = compute_top5(preds, labels) if top5 else 0
        errs1 += err1
        errs5 += err5
        cases += preds.shape[0]
    return errs1 / float(cases), errs5 / float(cases)
 def find_coeffs(paths, passes=5, cmin=0.0, cmax=1.0, step=0.05):
    """Coordinate-wise grid search for weights minimising the top-5 error.

    Every weight starts at the midpoint of [cmin, cmax].  In each pass the
    weights are visited in order; for one weight at a time every grid value
    cmin, cmin+step, ... (not exceeding cmax) is tried and kept only if it
    strictly lowers the top-5 error returned by do_avg().  The search ends
    after `passes` passes, or as soon as it comes back to the weight that
    was improved last without any improvement in between.

    paths  -- list of prediction directories, passed through to do_avg()
    passes -- maximum number of sweeps over all weights
    cmin, cmax, step -- bounds and spacing of the candidate grid

    Progress is printed.  Returns (weights, best top-5 error); earlier
    versions returned None, so callers that ignore the result still work.
    """
    # Midpoint of the interval; the previous (cmax-cmin)/2 was only the
    # midpoint when cmin == 0.
    coeffs = [cmin + (cmax-cmin)/2.0 for _ in paths]
    best1 = do_avg(paths, coeffs, top5=True)[1]
    # The small epsilon keeps float error (e.g. 0.3/0.1 == 2.999...) from
    # dropping the last grid point.
    num_steps = 1 + int((cmax-cmin)/step + 1e-9)
    grid = [cmin + k * step for k in xrange(num_steps)]
    changed = -1
    for p in xrange(passes):
        print("Pass %d" % p)
        for i in xrange(len(coeffs)):
            if changed == i:
                # A full cycle has gone by without any improvement.
                changed = -2
                break
            for c in grid:
                oldc = coeffs[i]
                coeffs[i] = c
                err = do_avg(paths, coeffs, top5=True)[1]
                if err < best1:
                    best1 = err
                    changed = i
                else:
                    coeffs[i] = oldc
            print("Best error rate: %.4f, coeffs: [%s]" % (best1, ",".join("%.2f" % f for f in coeffs)))
        if changed == -2:
            break
    return coeffs, best1
 def find_coeffs2(paths, passes=50):
    """Random local search for normalised weights minimising the top-5 error.

    The weights start equal and sum to 1.  In each pass the weights are
    visited in random order; one weight is perturbed by Gaussian noise
    whose scale shrinks as 1 / (2 * sqrt(1 + pass)), clipped to [0, 1],
    and the change is kept only if the top-5 error from do_avg() strictly
    drops.  The weights are renormalised after every trial.

    paths  -- list of prediction directories, passed through to do_avg()
    passes -- number of sweeps over all weights

    Progress is printed; an accepted change is marked with '*'.  Returns
    (weights, best top-5 error); earlier versions returned None.
    """
    coeffs = n.array([0.5 for i in xrange(len(paths))])
    coeffs /= coeffs.sum()
    # Baseline error of the starting point.  best1 used to be read in the
    # loop before it was ever assigned, raising UnboundLocalError.
    best1 = do_avg(paths, coeffs, top5=True)[1]
    for p in xrange(passes):
        print("Pass %d" % p)
        for i in nr.permutation(range(coeffs.shape[0])):
            c = min(1, max(0, coeffs[i] + nr.randn() / (2*sqrt(1+p))))
            oldc = coeffs[i]
            coeffs[i] = c
            err = do_avg(paths, coeffs, top5=True)[1]
            changed = ""
            if err < best1:
                best1 = err
                changed = "*"
            else:
                coeffs[i] = oldc
            coeffs /= coeffs.sum()
            print("Best error rate: %.4f, coeffs: [%s]%s" % (best1, ",".join("%.4f" % f for f in coeffs), changed))
    return coeffs, best1
 if __name__ == "__main__":
    # Arguments: <comma-separated prediction dirs> [<comma-separated weights>]
    paths = sys.argv[1].split(',')
    if len(sys.argv) != 2:
        # Weights supplied: report the error rates for exactly those weights.
        coeffs = n.array([float(x) for x in sys.argv[2].split(',')])
        errs = do_avg(paths, coeffs, top5=True)
        print("Average error rate with coeffs %s: %.4f %.4f" % (", ".join("%.2f" % f for f in coeffs), errs[0], errs[1]))
    else:
        # No weights supplied: search for them.
        find_coeffs(paths)
--- a/build.sh
+++ b/build.sh
@ -0,0 +1,19 @@
 #!/bin/sh
 # Stages this project's src/ and include/ together with the nvmatrix and
 # cudaconv2 sources into ./build, runs make there, and links the resulting
 # shared objects into the current directory.
 # Any arguments are passed on to make unchanged.
 #
 # Requires NVMATRIX_K20X_INCLUDE and NVCONV2_K20X_INCLUDE to name the
 # include directories of the two libraries.

 # Stop at the first failing step instead of linking stale or missing outputs.
 set -e

 # Fail early with a clear message; unset variables used to expand to "/..".
 : "${NVMATRIX_K20X_INCLUDE:?must name the nvmatrix include directory}"
 : "${NVCONV2_K20X_INCLUDE:?must name the cudaconv2 include directory}"

 NVMATRIX="$NVMATRIX_K20X_INCLUDE/.."
 CUDACONV="$NVCONV2_K20X_INCLUDE/.."
 rm -rf build
 # -f: not an error when no shared object exists yet.
 rm -f ./*.so
 mkdir -p build
 cp -r src build/
 cp -r include build/
 cp "$NVMATRIX/src/nvmatrix.cu" "$NVMATRIX/src/nvmatrix_kernels.cu" "$NVMATRIX/src/gpu_locking.cpp" build/src
 cp "$NVMATRIX/include/nvmatrix.cuh" "$NVMATRIX/include/nvmatrix_kernels.cuh" "$NVMATRIX/include/nvmatrix_operators.cuh" "$NVMATRIX/include/gpu_locking.h" build/include
 cp "$CUDACONV/src/conv_util.cu" "$CUDACONV/src/filter_acts.cu" "$CUDACONV/src/weight_acts.cu" "$CUDACONV/src/img_acts.cu" build/src
 cp "$CUDACONV/include/conv_util.cuh" "$CUDACONV/include/cudaconv2.cuh" build/include
 cp Makefile-all build/Makefile
 # Subshell keeps the caller's working directory; "$@" preserves arguments
 # that contain spaces.
 (cd build && make -j kepler=1 "$@")
 ln -fs build/*.so ./
--- a/convdata.py
+++ b/convdata.py
@ -0,0 +1,336 @@
 from data import *
 import numpy.random as nr
 import numpy as n
 import random as r
 from time import time
 from threading import Thread
 from math import sqrt
 import sys
 from pylab import *
 class FlatMemoryDataProvider(LabeledMemoryDataProvider):
    """In-memory provider returning mean-subtracted data and one-hot labels."""
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        LabeledMemoryDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        mean = self.batch_meta['data_mean']
        # Column vector so that it broadcasts across the cases of a batch.
        self.data_mean = mean.reshape((mean.shape[0], 1))
        for batch in self.data_dic:
            # Mean-subtracted, single precision and C-ordered.
            batch['data'] = n.require((batch['data'] - self.data_mean), dtype=n.single, requirements='C')
            batch['labels'] = batch['labels'].astype(n.int)
            num_cases = batch['data'].shape[1]
            # One column per case with a single 1.0 in the row of its label.
            onehot = n.zeros((self.get_num_classes(), num_cases), dtype=n.single)
            batch['labelprobs'] = onehot
            for case in xrange(num_cases):
                onehot[batch['labels'][case], case] = 1.0
    def get_next_batch(self):
        """Return (epoch, batchnum, [data, labelprobs]) for the next batch."""
        fetched = LabeledMemoryDataProvider.get_next_batch(self)
        epoch, batchnum, datadic = fetched
        return epoch, batchnum, [datadic['data'], datadic['labelprobs']]
    def get_data_dims(self, idx=0):
        """Return the row count of output matrix idx (0: data, otherwise labels)."""
        if idx == 0:
            return self.batch_meta['num_vis']
        return self.get_num_classes()
 class ImageNetDP(LabeledDataProvider):
    """Base ImageNet provider.

    Each batch is returned as [mean-subtracted data, labels, wordpres].
    get_labels() is a stub here and has to be supplied by a subclass.
    """
    MAX_PCA_COMPONENTS = 1024 # Use this many components for noise generation
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.init_commons(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
    def init_commons(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        """Cache dataset statistics from batch_meta and noise settings from dp_params."""
        meta = self.batch_meta
        self.data_mean = meta['data_mean'].astype(n.single)
        color_pca = meta['color_pca']
        self.color_eig = color_pca[1].astype(n.single)
        self.color_stdevs = n.c_[color_pca[0].astype(n.single)]
        self.color_noise_coeff = dp_params['color_noise']
        self.pca_noise_coeff = dp_params['pca_noise']
        self.num_colors = 3
        # Side length of the square image, derived from the pixel count.
        self.img_size = int(sqrt(meta['num_vis'] / self.num_colors))
    def get_labels(self, datadic):
        """Stub: returns None."""
        pass
    def showimg(self, img):
        """Display one flattened 3-channel square image with pylab."""
        size = int(sqrt(img.shape[0] / 3))
        picture = img.reshape((3,size,size)).swapaxes(0,2).swapaxes(0,1)
        imshow(picture, interpolation='nearest')
        show()
    def get_next_batch(self):
        """Return (epoch, batchnum, [data, labels, wordpres]) for the next batch."""
        epoch, batchnum, datadic = LabeledDataProvider.get_next_batch(self)
        # This takes about 1 sec per batch :(
        # If I don't convert both to single ahead of time, it takes even longer.
        centered = datadic['data'] - self.data_mean
        data = n.require(centered, dtype=n.single, requirements='C')
        labels = self.get_labels(datadic)
        wordpres = datadic['wordpres']
        # Labels have to be in the range 0-(number of classes - 1)
        assert labels.max() < self.get_num_classes(), "Invalid labels!"
        assert labels.min() == 0, "Invalid labels!"
        return epoch, batchnum, [data, labels, wordpres]
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        restored = data + (self.data_mean if add_mean else 0)
        images = restored.T.reshape(data.shape[1], 3, self.img_size, self.img_size)
        images = images.swapaxes(1,3).swapaxes(1,2)
        return n.require(images / 255.0, dtype=n.single)
 class ImageNetLogRegDP(ImageNetDP):
    """ImageNetDP variant whose labels form a single-precision row vector."""
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        ImageNetDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
    def get_labels(self, datadic):
        """Return the batch labels as a (1, number of cases) float32 array."""
        labels = n.array(datadic['labels'], dtype=n.single)
        return labels.reshape((1, datadic['data'].shape[1]))
    def get_data_dims(self, idx=0):
        """Return the row count of output matrix idx: pixels for 0, 100 for 2, else 1."""
        if idx == 0:
            return self.img_size**2 * self.num_colors
        return 100 if idx == 2 else 1
 class BatchLoaderThread(Thread):
    """Thread that unpickles one batch file and appends it to a caller-owned list."""
    def __init__(self, data_dir, path, list_out):
        Thread.__init__(self)
        self.data_dir, self.path, self.list_out = data_dir, path, list_out
    def run(self):
        loaded = unpickle(self.path)
        self.list_out.append(loaded)
 class ColorNoiseMakerThread(Thread):
    """Thread that draws one block of colour noise and appends it to a list.

    The noise is pca_vecs . (standard normal samples * pca_stdevs), with
    one column per noise sample.
    """
    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs = pca_stdevs
        self.pca_vecs = pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out
    def run(self):
        samples = nr.randn(3, self.num_noise).astype(n.single)
        scaled = samples * self.pca_stdevs
        self.list_out.append(n.dot(self.pca_vecs, scaled))
 class CroppedImageNetDP(ImageNetDP):
    """ImageNet provider that serves inner_size x inner_size crops.

    Training: batches are loaded one ahead by a background thread; every
    chunk of crop_chunk cases gets one random crop offset and is mirrored
    with probability 1/2; colour noise is added when
    dp_params['color_noise'] > 0.
    Testing: the centre crop, or (with dp_params['multiview_test']) five
    crops plus their mirror images, i.e. num_views copies of every case.

    get_labels() is a stub here and has to be supplied by a subclass.
    """
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        ImageNetDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.border_size = dp_params['crop_border']
        self.inner_size = self.img_size - self.border_size*2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2
        self.data_mult = self.num_views if self.multiview else 1
        self.crop_chunk = 32 # This many images will be cropped in the same way
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        # I've never seen this happen but it's a safety measure.
        self.data = [None, None]
        self.cropped_data = [n.zeros((self.get_data_dims(), 0*self.data_mult), dtype=n.single) for x in xrange(2)]
        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']
        self.num_noise = 1024
        self.batches_generated = 0
        # Centre crop of the mean image, flattened to a column vector.
        self.data_mean_crop = self.data_mean.reshape((3,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((3*self.inner_size**2, 1))
    def get_data_dims(self, idx=0):
        """Return the row count of output matrix idx (0: cropped pixels, otherwise 1)."""
        if idx == 0:
            return self.inner_size**2 * 3
        return 1
    def start_color_noise_maker(self):
        """Start a noise thread and return the list it will append its result to."""
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list
    def get_labels(self, datadic):
        """Stub: returns None."""
        pass
    def start_loader(self, batch_idx):
        """Start loading the batch at position batch_idx of batch_range into self.load_data."""
        self.load_data = []
        self.loader_thread = BatchLoaderThread(self.data_dir, self.get_data_file_name(self.batch_range[batch_idx]), self.load_data)
        self.loader_thread.start()
    def get_next_batch(self):
        """Return (epoch, batchnum, [cropped data, labels]) for the next batch."""
        # Two buffers are used alternately.
        self.d_idx = self.batches_generated % 2
        if self.test:
            epoch, batchnum, self.data[self.d_idx] = LabeledDataProvider.get_next_batch(self)
        else:
            epoch, batchnum = self.curr_epoch, self.curr_batchnum
            if self.loader_thread is None:
                # First call: load the current batch synchronously, then
                # start prefetching the next one.
                self.start_loader(self.batch_idx)
                self.loader_thread.join()
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            else:
                # Set the argument to join to 0 to re-enable batch reuse
                self.loader_thread.join()
                if not self.loader_thread.is_alive():
                    self.data[self.d_idx] = self.load_data[0]
                    self.start_loader(self.get_next_batch_idx())
            self.advance_batch()
        cropped = self.get_cropped_data(self.data[self.d_idx])
        if self.color_noise_coeff > 0 and not self.test:
            # At this point the data already has 0 mean.
            # So I'm going to add noise to it, but I'm also going to scale down
            # the original data. This is so that the overall scale of the training
            # data doesn't become too different from the test data.
            s = cropped.shape
            cropped_size = self.get_data_dims(0) // 3
            ncases = s[1]
            if self.color_noise_thread is None:
                self.color_noise_list = self.start_color_noise_maker()
                self.color_noise_thread.join()
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()
            else:
                self.color_noise_thread.join(0)
                if not self.color_noise_thread.is_alive():
                    self.color_noise = self.color_noise_list[0]
                    self.color_noise_list = self.start_color_noise_maker()
                # If the noise thread IS alive, then we'll just re-use the noise from the last run
            # Regroup to (pixels, channel*case) so that one noise value per
            # (channel, case) broadcasts over all pixels.
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((3, cropped_size, ncases)).swapaxes(0,1).reshape((cropped_size, ncases*3))
            # Keep self.color_noise in its (3, num_noise) layout.  It used to
            # be overwritten with this flattened view, which made the reshape
            # fail whenever the same noise was re-used for a later batch.
            noise = self.color_noise[:,:ncases].reshape((1, 3*ncases))
            cropped += noise * self.color_noise_coeff
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((cropped_size, 3, ncases)).swapaxes(0,1).reshape(s)
            cropped /= 1.0 + self.color_noise_coeff
        self.data[self.d_idx]['labels'] = self.get_labels(self.data[self.d_idx])
        self.data[self.d_idx]['data'] = cropped
        self.batches_generated += 1
        return epoch, batchnum, [self.data[self.d_idx]['data'], self.data[self.d_idx]['labels']]
    def get_cropped_data(self, data):
        """Crop data['data'] into the current buffer and subtract the cropped mean."""
        cropped = self.cropped_data[self.d_idx]
        # Reallocate the buffer only when the number of cases changed.
        if cropped.shape[1] != data['data'].shape[1] * self.data_mult:
            cropped = self.cropped_data[self.d_idx] = n.zeros((cropped.shape[0], data['data'].shape[1] * self.data_mult), dtype=n.single)
        self.__trim_borders(data['data'], cropped)
        return self.subtract_mean(cropped)
    def subtract_mean(self,data):
        """Subtract the cropped mean image from data in place and return data."""
        data -= self.data_mean_crop
        return data
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        return n.require((data + (self.data_mean_crop if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
    def __trim_borders(self, x, target):
        """Write crops of the flattened images in x into target (one column per case and view)."""
        y = x.reshape(3, self.img_size, self.img_size, x.shape[1])
        if self.test: # don't need to loop over cases
            if self.multiview:
                # Four corners and the centre; the second half of the views
                # holds the mirror images of the first half.
                start_positions = [(0,0),  (0, self.border_size*2),
                                   (self.border_size, self.border_size),
                                  (self.border_size*2, 0), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                half_views = self.num_views // 2
                for i in xrange(half_views):
                    pic = y[:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1],:]
                    target[:,i * x.shape[1]:(i+1)* x.shape[1]] = pic.reshape((self.get_data_dims(),x.shape[1]))
                    target[:,(half_views + i) * x.shape[1]:(half_views +i+1)* x.shape[1]] = pic[:,:,::-1,:].reshape((self.get_data_dims(),x.shape[1]))
            else:
                pic = y[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size, :] # just take the center for now
                target[:,:] = pic.reshape((self.get_data_dims(), x.shape[1]))
        else:
            for c in xrange(0, x.shape[1], self.crop_chunk): # loop over cases in chunks
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                c_end = min(c + self.crop_chunk, x.shape[1])
                pic = y[:,startY:endY,startX:endX, c:c_end]
                if nr.randint(2) == 0: # also flip the images with 50% probability
                    pic = pic[:,:,::-1,:]
                target[:,c:c_end] = pic.reshape((self.get_data_dims(),c_end-c))
 class CroppedImageNetLogRegDP(CroppedImageNetDP):
    """CroppedImageNetDP variant whose labels are a row vector repeated once per view."""
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        CroppedImageNetDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
    def get_labels(self, datadic):
        """Return the labels as a C-ordered (1, cases * data_mult) float32 array."""
        row = n.array(datadic['labels'], dtype=n.single).reshape((1, datadic['data'].shape[1]))
        repeated = n.tile(row, (1, self.data_mult))
        return n.require(repeated, requirements='C')
 class RandomScaleImageNetLogRegDP(CroppedImageNetLogRegDP):
    """Variant that serves whole images with a scalar mean removed.

    Training batches are mirrored in place per chunk of crop_chunk cases;
    plain test batches are passed through; multiview test batches hold a
    3x3 grid of inner_size crops, i.e. nine views of every case.

    This class had fallen out of step with its base class: its
    get_cropped_data() took no argument although the inherited
    get_next_batch() passes the batch dictionary, it indexed the two-slot
    list self.data as if it were a dictionary, it looped over the ten views
    of the base class while defining nine crop positions, and it deleted
    self.cropped_data although the inherited colour-noise path assigns
    into it.
    """
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        CroppedImageNetLogRegDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        # get_cropped_data() produces nine crops, so the view count and the
        # label replication factor derived from it must be nine as well.
        self.num_views = 9
        self.data_mult = self.num_views if self.multiview else 1
        # self.cropped_data is deliberately kept (it holds only zero-column
        # arrays): the inherited get_next_batch() stores into it.
        self.data_mean_mean = self.data_mean.mean()
    def get_cropped_data(self, data=None):
        """Return the mean-subtracted images (or multiview crops) of a batch.

        data -- batch dictionary with a 'data' matrix; defaults to the
                buffer the inherited get_next_batch() is currently filling
        """
        if data is None:
            data = self.data[self.d_idx]
        x = data['data']
        if self.test and self.multiview:
            y = x.reshape(3, self.img_size, self.img_size, x.shape[1])
            offsets = (0, self.border_size, self.border_size*2)
            # Row-major 3x3 grid of crop origins.
            start_positions = [(sy, sx) for sy in offsets for sx in offsets]
            target = n.zeros((self.inner_size**2*3, x.shape[1]*len(start_positions)), dtype=n.uint8)
            for i, (sy, sx) in enumerate(start_positions):
                crop = y[:,sy:sy+self.inner_size,sx:sx+self.inner_size,:]
                target[:,i * x.shape[1]:(i+1)* x.shape[1]] = crop.reshape((self.inner_size**2*3,x.shape[1]))
            return self.subtract_mean(target)
        elif not self.test:
            # it should be ok to flip it into the same matrix
            # since if it ends up being reused, flips are invertible.
            self.reflect_data(x, x)
        return self.subtract_mean(x)
    def reflect_data(self, x, target):
        """Copy x into target, mirroring each chunk of cases with probability 1/2."""
        y = x.reshape(3, self.img_size, self.img_size, x.shape[1])
        for c in xrange(0, x.shape[1], self.crop_chunk): # loop over cases in chunks
            c_end = min(c + self.crop_chunk, x.shape[1])
            pic = y[:,:,:, c:c_end]
            if nr.randint(2) == 0: # flip the images with 50% probability
                pic = pic[:,:,::-1,:]
            target[:,c:c_end] = pic.reshape((self.get_data_dims(),c_end-c))
    # Note that this variant subtracts the same scalar from each pixel
    def subtract_mean(self, data):
        return n.require(data - self.data_mean_mean, dtype=n.single, requirements='C')
    def get_data_dims(self, idx=0):
        """Return the row count of output matrix idx (0: full-image pixels, otherwise 1)."""
        return self.img_size**2 * 3 if idx == 0 else 1
 class DummyConvNetLogRegDP(LabeledDummyDataProvider):
    """Dummy provider that adds a fixed 'tree', a root id and random gates."""
    def __init__(self, data_dim):
        LabeledDummyDataProvider.__init__(self, data_dim)
        # Every class starts with an empty child list; nodes 10-16 then
        # receive fixed children.
        tree = dict((i, []) for i in xrange(self.num_classes))
        tree.update({
            10: [0, 1, 2],
            11: [3, 4, 5],
            12: [6, 7],
            13: [8, 9],
            14: [10, 11],
            15: [12, 13],
            16: [14, 15],
        })
        self.batch_meta['tree'] = tree
        self.batch_meta['all_wnids'] = {'gproot': 16}
        self.img_size = int(sqrt(data_dim/3))
    def get_next_batch(self):
        """Return (epoch, batchnum, [data, labels, gates]) with transposed, C-ordered matrices."""
        epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
        for key in ('data', 'labels'):
            dic[key] = n.require(dic[key].T, requirements='C')
        # One uniform random value per case.
        dic['gates'] = nr.rand(1, dic['data'].shape[1]).astype(n.single)
        return epoch, batchnum, [dic['data'], dic['labels'], dic['gates']]
    # Returns the dimensionality of the two data matrices returned by get_next_batch
    def get_data_dims(self, idx=0):
        if idx == 0:
            return self.batch_meta['num_vis']
        return 1
--- a/convdata_cifar.py
+++ b/convdata_cifar.py
@ -0,0 +1,115 @@
 from data import *
 import numpy.random as nr
 import numpy as n
 import random as r
 class CIFARDataProvider(LabeledDataProvider):
    """CIFAR provider returning mean-subtracted data, a label row vector and a one-hot label matrix."""
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.data_mean = self.batch_meta['data_mean']
        self.num_colors = 3
        self.img_size = 32
        num_pixels = self.img_size**2 * self.num_colors
        self.data_dims = [num_pixels, 1, self.get_num_classes()]
    def get_next_batch(self):
        """Return (epoch, batchnum, [data, labelsVec, labelsMat]) for the next batch."""
        epoch, batchnum, datadic = LabeledDataProvider.get_next_batch(self)
        # A batch dictionary is converted once and flagged, so a dictionary
        # that comes back later is not converted again.
        if 'processed' not in datadic:
            data = n.require((datadic['data'] - self.data_mean), dtype=n.single, requirements='C')
            num_cases = data.shape[1]
            datadic['data'] = data
            datadic['labelsVec'] = n.require(n.array(datadic['labels']).reshape((1, num_cases)), requirements='C', dtype=n.single)
            one_hot = n.zeros((self.get_num_classes(), num_cases), dtype=n.single)
            one_hot[datadic['labels'],n.c_[0:num_cases]] = 1
            datadic['labelsMat'] = one_hot
            datadic['processed'] = True
        return epoch, batchnum, [datadic['data'], datadic['labelsVec'], datadic['labelsMat']]
    # Returns the dimensionality of the two data matrices returned by get_next_batch
    # idx is the index of the matrix. 
    def get_data_dims(self, idx=0):
        dims = self.data_dims
        return dims[idx]
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data):
        restored = (data + self.data_mean).T
        images = restored.reshape(data.shape[1], 3, self.img_size, self.img_size)
        images = images.swapaxes(1,3).swapaxes(1,2)
        return n.require(images / 255.0, dtype=n.single)
 class CroppedCIFARDataProvider(LabeledMemoryDataProvider):
    """In-memory CIFAR provider that serves crops of the 32x32 images.

    Training: every case gets its own random crop offset and is mirrored
    with probability 1/2.  Testing: the centre crop, or (multiview) a 3x3
    grid of nine crops per case.
    """
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        LabeledMemoryDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.border_size = dp_params['crop_border']
        self.inner_size = 32 - self.border_size*2
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 9
        self.data_mult = self.num_views if self.multiview else 1
        self.num_colors = 3
        for d in self.data_dic:
            d['data'] = n.require(d['data'], requirements='C')
            label_row = d['labels'].reshape((1, d['data'].shape[1]))
            # Labels are repeated once per view.
            d['labels'] = n.require(n.tile(label_row, (1, self.data_mult)), requirements='C')
        # Two output buffers, used alternately by get_next_batch.
        buf_shape = (self.get_data_dims(), self.data_dic[0]['data'].shape[1]*self.data_mult)
        self.cropped_data = [n.zeros(buf_shape, dtype=n.single) for x in xrange(2)]
        self.batches_generated = 0
        # Centre crop of the mean image, flattened to a column vector.
        lo, hi = self.border_size, self.border_size+self.inner_size
        mean_img = self.batch_meta['data_mean'].reshape((3,32,32))
        self.data_mean = mean_img[:,lo:hi,lo:hi].reshape((self.get_data_dims(), 1))
    def get_next_batch(self):
        """Return (epoch, batchnum, [cropped data, labels]) for the next batch."""
        epoch, batchnum, datadic = LabeledMemoryDataProvider.get_next_batch(self)
        buf = self.cropped_data[self.batches_generated % 2]
        self.__trim_borders(datadic['data'], buf)
        buf -= self.data_mean
        self.batches_generated += 1
        return epoch, batchnum, [buf, datadic['labels']]
    def get_data_dims(self, idx=0):
        """Return the row count of output matrix idx (0: cropped pixels, otherwise 1)."""
        if idx == 0:
            return self.inner_size**2 * 3
        return 1
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data):
        restored = (data + self.data_mean).T
        images = restored.reshape(data.shape[1], 3, self.inner_size, self.inner_size)
        images = images.swapaxes(1,3).swapaxes(1,2)
        return n.require(images / 255.0, dtype=n.single)
    def __trim_borders(self, x, target):
        """Write crops of the flattened images in x into target."""
        y = x.reshape(3, 32, 32, x.shape[1])
        num_cases = x.shape[1]
        if not self.test:
            for c in xrange(num_cases): # loop over cases
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                pic = y[:,startY:endY,startX:endX, c]
                if nr.randint(2) == 0: # also flip the image with 50% probability
                    pic = pic[:,:,::-1]
                target[:,c] = pic.reshape((self.get_data_dims(),))
            return
        # Test mode: don't need to loop over cases
        if not self.multiview:
            lo, hi = self.border_size, self.border_size+self.inner_size
            pic = y[:,lo:hi,lo:hi, :] # just take the center for now
            target[:,:] = pic.reshape((self.get_data_dims(), num_cases))
            return
        # Row-major 3x3 grid of crop origins.
        offsets = (0, self.border_size, self.border_size*2)
        start_positions = [(sy, sx) for sy in offsets for sx in offsets]
        for i in xrange(self.num_views):
            sy, sx = start_positions[i]
            view = y[:,sy:sy+self.inner_size,sx:sx+self.inner_size,:]
            target[:,i * num_cases:(i+1)* num_cases] = view.reshape((self.get_data_dims(),num_cases))
 class DummyConvNetDataProvider(LabeledDummyDataProvider):
    """Dummy provider whose data and labels are returned transposed and C-ordered."""
    def __init__(self, data_dim):
        LabeledDummyDataProvider.__init__(self, data_dim)
    def get_next_batch(self):
        """Return (epoch, batchnum, [data, labels]) for the next batch."""
        epoch, batchnum, dic = LabeledDummyDataProvider.get_next_batch(self)
        for key in ('data', 'labels'):
            dic[key] = n.require(dic[key].T, requirements='C')
        return epoch, batchnum, [dic['data'], dic['labels']]
    # Returns the dimensionality of the two data matrices returned by get_next_batch
    def get_data_dims(self, idx=0):
        if idx == 0:
            return self.batch_meta['num_vis']
        return 1
--- a/convdata_flickr.py
+++ b/convdata_flickr.py
@ -0,0 +1,297 @@
 from data import *
 import numpy.random as nr
 import numpy as n
 import random as r
 from time import time
 from threading import Thread
 from math import sqrt
 import sys
 from pylab import *
 from PIL import Image
 from StringIO import StringIO
 class JPEGBatchLoaderThread(Thread):
    # Background thread that unpickles one batch file, decodes its JPEG strings
    # into the caller-supplied buffers and appends the resulting
    # {'data': ..., 'labels': ...} dictionary to list_out.
    def __init__(self, data_dir, path, freq_to_id, tgt, tgt_labels, list_out):
        Thread.__init__(self)
        self.data_dir = data_dir
        self.path = path
        self.tgt = tgt
        self.tgt_labels = tgt_labels
        self.list_out = list_out
        self.freq_to_id = freq_to_id
    # Normalizes every raw tag (lower-cased, all whitespace removed) and returns
    # the ids of the tags that are present in freq_to_id, in input order.
    # Tags that are not in freq_to_id are silently dropped.
    @staticmethod
    def raw_to_freq_id(raw_tags, freq_to_id):
        normalized = [''.join(t.lower().strip().split()) for t in raw_tags]
        return [freq_to_id[t] for t in normalized if t in freq_to_id]
    # Decodes one batch into the preallocated buffers.
    #   batch      -- a (strings, sizes, labels) triple; strings are encoded
    #                 images, labels[k] is the list of raw tags of image k.
    #                 sizes is accepted but not used.
    #   freq_to_id -- mapping from normalized tag to label column.
    #   tgt        -- buffer receiving one flattened channel-major image per row.
    #   tgt_labels -- label buffer; it is zeroed in full, then the columns of
    #                 each image's known tags are set to 1.
    # Returns views of the first len(strings) rows of both buffers.
    # The batch is taken as one ordinary parameter and unpacked explicitly
    # (tuple parameters were removed from the language by PEP 3113); callers
    # that pass the triple positionally are unaffected.
    @staticmethod
    def load_jpeg_batch(batch, freq_to_id, tgt, tgt_labels):
        strings, _sizes, labels = batch
        tgt_labels[:] = 0
        for k,s in enumerate(strings):
            ima = n.asarray(Image.open(StringIO(s)).convert('RGB'))
            # (rows, cols, channels) -> (channels, rows, cols) before flattening
            tgt[k,:] = ima.swapaxes(0,2).swapaxes(1,2).flatten()
            tgt_labels[k, JPEGBatchLoaderThread.raw_to_freq_id(labels[k], freq_to_id)] = 1
        return {'data': tgt[:len(strings),:],
                'labels': tgt_labels[:len(strings),:]}
    # Thread body: load the pickled batch at self.path and publish the result.
    def run(self):
        p = self.load_jpeg_batch(unpickle(self.path),
                                 self.freq_to_id,
                                 self.tgt,
                                 self.tgt_labels)
        self.list_out.append(p)
 class ColorNoiseMakerThread(Thread):
    # Worker thread that draws num_noise rows of 3-component Gaussian noise,
    # scales them by pca_stdevs, projects them through pca_vecs and appends the
    # resulting matrix to list_out.
    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs = pca_stdevs
        self.pca_vecs = pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out
    # Thread body: generate one noise matrix and publish it.
    def run(self):
        gaussian = nr.randn(self.num_noise, 3).astype(n.single)
        scaled = gaussian * self.pca_stdevs.T
        self.list_out.append(n.dot(scaled, self.pca_vecs.T))
 class FlickrDP(LabeledDataProvider):
    # Labeled provider for Flickr batches. It caches the mean image, the colour
    # PCA statistics and the tag vocabulary found in batch_meta.
    MAX_PCA_COMPONENTS = 1024 # Use this many components for noise generation
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        ctor_args = (data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        LabeledDataProvider.__init__(self, *ctor_args)
        self.init_commons(*ctor_args)
    # Copies the settings shared by the Flickr providers out of batch_meta and
    # dp_params. Of the arguments only dp_params is read here; it must contain
    # the keys 'color_noise' and 'pca_noise'.
    def init_commons(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        meta = self.batch_meta
        self.data_mean = meta['data_mean'].astype(n.single)
        color_pca = meta['color_pca']
        self.color_eig = color_pca[1].astype(n.single)
        self.color_stdevs = n.c_[color_pca[0].astype(n.single)]
        self.color_noise_coeff = dp_params['color_noise']
        self.pca_noise_coeff = dp_params['pca_noise']
        self.num_colors = 3
        # Images are square: num_vis = num_colors * img_size^2
        self.img_size = int(sqrt(meta['num_vis'] / self.num_colors))
        self.freq_to_id = meta['freq_to_id']
    # Label-extraction hook. This base version returns None, so get_next_batch
    # below only works on subclasses that override it.
    def get_labels(self, datadic):
        return None
    # Displays one flattened 3-channel square image with pylab.
    def showimg(self, img):
        side = int(sqrt(img.shape[0] / 3))
        planes = img.reshape((3,side,side))
        # (channels, rows, cols) -> (rows, cols, channels) for imshow
        imshow(planes.swapaxes(0,2).swapaxes(0,1), interpolation='nearest')
        show()
    # Returns (epoch, batchnum, [data, labels]) with the mean image subtracted
    # from the data.
    def get_next_batch(self):
        epoch, batchnum, datadic = LabeledDataProvider.get_next_batch(self)
        # This takes about 1 sec per batch :(
        # If I don't convert both to single ahead of time, it takes even longer.
        centered = datadic['data'] - self.data_mean
        data = n.require(centered, dtype=n.single, requirements='C')
        labels = self.get_labels(datadic)
        # Labels have to be in the range 0-(number of classes - 1)
        assert labels.max() < self.get_num_classes(), "Invalid labels!"
        assert labels.min() >= 0, "Invalid labels!"
        return epoch, batchnum, [data, labels]
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        offset = self.data_mean if add_mean else 0
        images = (data + offset).reshape(data.shape[0], 3, self.img_size, self.img_size)
        images = images.swapaxes(1,3).swapaxes(1,2)
        return n.require(images / 255.0, dtype=n.single)
 class JPEGCroppedFlickrDP(FlickrDP):
    # Flickr provider that decodes JPEG batches, crops every image to an
    # inner_size x inner_size window, subtracts the matching crop of the mean
    # image and, when training with color_noise > 0, adds colour noise.
    # Training batches are decoded on a background JPEGBatchLoaderThread while
    # the previous batch is being consumed.
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        # Same two calls that FlickrDP.__init__ makes, issued directly.
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.init_commons(data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.img_size = int(sqrt(self.batch_meta['num_vis'] / self.num_colors))
        self.border_size = dp_params['crop_border']
        self.inner_size = self.img_size - self.border_size*2
        # Multi-view evaluation is only enabled for test providers.
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2 # 5 crop positions, each also mirrored
        self.data_mult = self.num_views if self.multiview else 1
        self.crop_chunk = 32 # This many images will be cropped in the same way
        self.batch_size = self.batch_meta['batch_size']
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        # I've never seen this happen but it's a safety measure.
        self.data = [None, None]
        # Crop targets start with zero rows and are (re)allocated by get_cropped_data.
        self.cropped_data = [n.zeros((0*self.data_mult, self.get_data_dims()), dtype=n.float32) for x in xrange(2)]
        if self.test:
            self.orig_data = [n.zeros((self.batch_size, self.img_size**2*3), dtype=n.uint8) for x in xrange(1)]
            self.orig_labels = [n.zeros((self.batch_size, self.get_num_classes()), dtype=n.float32) for x in xrange(2)]
        else:
            self.orig_data = [n.zeros((self.batch_size, self.img_size**2*3), dtype=n.uint8) for x in xrange(2)]
            # There have to be 3 copies of labels because this matrix actually gets used by the training code
            self.orig_labels = [n.zeros((self.batch_size, self.get_num_classes()), dtype=n.float32) for x in xrange(3)]
        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']
        self.num_noise = self.batch_size
        self.batches_generated, self.loaders_started = 0, 0
        # Centre crop of the mean image, flattened to a single row.
        self.data_mean_crop = self.data_mean.reshape((3,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))
    # Returns the row count of the data matrix (idx 0: cropped pixels x 3
    # channels) or of the label matrix (idx 1: number of classes).
    def get_data_dims(self, idx=0):
        assert idx in (0,1), "Invalid index: %d" % idx
        if idx == 0:
            return self.inner_size**2 * 3
        return self.get_num_classes()
    # Starts decoding batch_range[batch_idx] in the background. The result is
    # appended to a fresh self.load_data list; the image and label buffers are
    # rotated (2 and 3 ways respectively) on every start.
    def start_loader(self, batch_idx):
        self.load_data = []
        self.loader_thread = JPEGBatchLoaderThread(self.data_dir, self.get_data_file_name(self.batch_range[batch_idx]), self.freq_to_id,
                                                   self.orig_data[self.loaders_started % 2], self.orig_labels[self.loaders_started % 3],
                                                   self.load_data)
        self.loader_thread.start()
        self.loaders_started += 1
    # Starts a ColorNoiseMakerThread and returns the list it will append its
    # noise matrix to.
    def start_color_noise_maker(self):
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list
    # Label-extraction hook; returns None here and is overridden by subclasses
    # such as JPEGCroppedFlickrCEDP.
    def get_labels(self, datadic):
        pass
    # Returns (epoch, batchnum, [data, labels]) with both matrices transposed
    # so that cases run along the second axis.
    def get_next_batch(self):
        self.d_idx = self.batches_generated % 2
        if self.test:
            # Test batches are decoded synchronously.
            epoch, batchnum, self.data[self.d_idx] = LabeledDataProvider.get_next_batch(self)
            self.data[self.d_idx] = JPEGBatchLoaderThread.load_jpeg_batch(self.data[self.d_idx], self.freq_to_id, self.orig_data[0], self.orig_labels[self.d_idx])
        else:
            epoch, batchnum = self.curr_epoch, self.curr_batchnum
            if self.loader_thread is None:
                # First call: load the current batch now, then prefetch the next.
                self.start_loader(self.batch_idx)
                self.loader_thread.join()
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            else:
                # Set the argument to join to 0 to re-enable batch reuse
                self.loader_thread.join()
                if not self.loader_thread.is_alive():
                    self.data[self.d_idx] = self.load_data[0]
                    self.start_loader(self.get_next_batch_idx())
            self.advance_batch()
        cropped = self.get_cropped_data(self.data[self.d_idx])
        if self.color_noise_coeff > 0 and not self.test:
            # At this point the data already has 0 mean.
            # So I'm going to add noise to it, but I'm also going to scale down
            # the original data. This is so that the overall scale of the training
            # data doesn't become too different from the test data.
            ncases = cropped.shape[0]
            # Floor division: the value is used as a reshape dimension.
            cropped_size = self.get_data_dims(0) // 3
            if self.color_noise_thread is None:
                self.color_noise_list = self.start_color_noise_maker()
                self.color_noise_thread.join()
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()
            else:
                # Non-blocking poll: if the maker has not finished yet, the
                # previous noise matrix is used again.
                self.color_noise_thread.join(0)
                if not self.color_noise_thread.is_alive():
                    self.color_noise = self.color_noise_list[0]
                    self.color_noise_list = self.start_color_noise_maker()
            # One row per (case, channel) so a single noise value shifts a whole channel.
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((ncases*3, cropped_size))
            # Keep self.color_noise in its (num_noise, 3) layout: storing the
            # reshaped column back on self broke the reuse path above, because
            # the next call would slice and reshape the already-reshaped array.
            noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
            cropped += noise * self.color_noise_coeff
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((ncases, 3* cropped_size))
            cropped /= (1.0 + self.color_noise_coeff)
        self.data[self.d_idx]['labels'] = self.get_labels(self.data[self.d_idx])
        self.data[self.d_idx]['data'] = cropped
        self.batches_generated += 1
        return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labels'].T]
    # Crops data['data'] into the current crop buffer (reallocating it when the
    # number of rows changed) and returns the buffer with the mean removed.
    def get_cropped_data(self, data):
        cropped = self.cropped_data[self.d_idx]
        if cropped.shape[0] != data['data'].shape[0] * self.data_mult:
            cropped = self.cropped_data[self.d_idx] = n.zeros((data['data'].shape[0] * self.data_mult, cropped.shape[1]), dtype=n.float32)
        self.__trim_borders(data['data'], cropped)
        return self.subtract_mean(cropped)
    # Subtracts the cropped mean image in place and returns the same array.
    def subtract_mean(self,data):
        data -= self.data_mean_crop
        return data
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        return n.require((data.T + (self.data_mean_crop if add_mean else 0)).reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
    # Writes cropped versions of the images in x (one flattened image per row)
    # into target.
    #   test + multiview: the 4 corner crops and the centre crop, followed by
    #                     their left-right mirrored versions (num_views blocks
    #                     of x.shape[0] rows each).
    #   test only:        the centre crop.
    #   training:         per chunk of crop_chunk images, one random crop
    #                     offset and a 50% chance of a left-right flip.
    def __trim_borders(self, x, target):
        y = x.reshape(x.shape[0], 3, self.img_size, self.img_size)
        if self.test: # don't need to loop over cases
            if self.multiview:
                start_positions = [(0,0),  (0, self.border_size*2),
                                   (self.border_size, self.border_size),
                                  (self.border_size*2, 0), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                half_views = self.num_views // 2
                for i in xrange(half_views):
                    pic = y[:,:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1]]
                    target[i * x.shape[0]:(i+1)* x.shape[0],:] = pic.reshape((x.shape[0], self.get_data_dims()))
                    target[(half_views + i) * x.shape[0]:(half_views +i+1)* x.shape[0],:] = pic[:,:,:,::-1].reshape((x.shape[0],self.get_data_dims()))
            else:
                pic = y[:,:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size] # just take the center for now
                target[:,:] = pic.reshape((x.shape[0], self.get_data_dims()))
        else:
            for c in xrange(0, x.shape[0], self.crop_chunk): # loop over cases in chunks
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                c_end = min(c + self.crop_chunk, x.shape[0])
                pic = y[c:c_end,:,startY:endY,startX:endX]
                if nr.randint(2) == 0: # also flip the images with 50% probability
                    pic = pic[:,:,:,::-1]
                target[c:c_end,:] = pic.reshape((c_end-c, self.get_data_dims()))
 class JPEGCroppedFlickrCEDP(JPEGCroppedFlickrDP):
    # Cropped Flickr provider that supplies the label matrix itself.
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        JPEGCroppedFlickrDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
    # Returns data['labels'] stacked data_mult times along the first axis, as a
    # C-contiguous array.
    def get_labels(self, data):
        repeated = n.tile(data['labels'], (self.data_mult, 1))
        return n.require(repeated, requirements='C')
 class DummyConvNetCEDP(LabeledDummyDataProvider):
    # Dummy provider with 16 classes and 16 cases whose labels are random
    # one-hot columns.
    def __init__(self, data_dim):
        LabeledDummyDataProvider.__init__(self, data_dim, num_classes=16, num_cases=16)
    # Returns (epoch, batchnum, [data, labels]). data is the Fortran-ordered
    # transpose of the dummy data; labels is a freshly allocated Fortran-ordered
    # float32 matrix with exactly one randomly chosen 1 per column.
    def get_next_batch(self):
        epoch, batchnum, batch = LabeledDummyDataProvider.get_next_batch(self)
        data = batch['data'] = n.require(batch['data'].T, requirements='F')
        label_shape = (self.get_data_dims(idx=1), data.shape[1])
        labels = batch['labels'] = n.zeros(label_shape, dtype=n.float32, order='F')
        num_rows, num_cases = labels.shape
        for case in xrange(num_cases): # loop over cases
            hot_row = nr.randint(0, num_rows)
            labels[hot_row, case] = 1
        return epoch, batchnum, [data, labels]
    # Returns the dimensionality of the two data matrices returned by get_next_batch
    def get_data_dims(self, idx=0):
        if idx == 0:
            return self.batch_meta['num_vis']
        return 16
--- a/convdata_jpeg.py
+++ b/convdata_jpeg.py
@ -0,0 +1,270 @@
 from data import *
 import numpy.random as nr
 import numpy as n
 import random as r
 from time import time
 from threading import Thread
 from math import sqrt
 import sys
 from pylab import *
 from PIL import Image
 from StringIO import StringIO
 from convdata import ImageNetDP
 class JPEGBatchLoaderThread(Thread):
    # Background thread that unpickles one batch file, decodes its JPEG strings
    # into the caller-supplied buffer and appends the resulting
    # {'data': ..., 'labels': ...} dictionary to list_out.
    def __init__(self, data_dir, path, data_mean, no_crop, label_offset, tgt, list_out):
        Thread.__init__(self)
        self.data_dir = data_dir
        self.path = path
        self.tgt = tgt
        self.list_out = list_out
        self.label_offset = label_offset
        self.data_mean = data_mean
        self.no_crop = no_crop
    # Decodes one batch into tgt.
    #   batch        -- a (strings, orig_sizes, labels) triple; strings are
    #                   encoded images and labels[k][1] is the class of image k
    #                   (or None). orig_sizes is accepted but not used.
    #   data_mean    -- fill value for the canvas when no_crop is set.
    #   no_crop      -- if true, each decoded image is pasted into the middle
    #                   of a 256x256 canvas pre-filled with data_mean.
    #   label_offset -- added to every label that is not None.
    #   tgt          -- buffer receiving one flattened channel-major image per row.
    # Images whose decoding raises IOError are skipped; the remaining rows are
    # packed together, so fewer than len(strings) rows may be returned.
    # The batch is taken as one ordinary parameter and unpacked explicitly
    # (tuple parameters were removed from the language by PEP 3113); callers
    # that pass the triple positionally are unaffected.
    @staticmethod
    def load_jpeg_batch(batch, data_mean, no_crop, label_offset, tgt):
        strings, _orig_sizes, labels = batch
        lab_arr = n.zeros((len(strings), 1), dtype=n.single)
        failed = 0
        img256 = n.zeros((256, 256, 3), dtype=n.uint8) if no_crop else None
        for k,(s,l) in enumerate(zip(strings, labels)):
            try:
                ima = n.asarray(Image.open(StringIO(s)).convert('RGB'))
                if no_crop:
                    # Floor division: the offsets are used as slice indices.
                    off_y, off_x = (256 - ima.shape[0]) // 2, (256 - ima.shape[1]) // 2
                    img256[:] = data_mean
                    img256[off_y:ima.shape[0]+off_y,off_x:ima.shape[1]+off_x,:] = ima
                    tgt[k - failed,:] = img256.swapaxes(0,2).swapaxes(1,2).flatten()
                else:
                    tgt[k - failed,:] = ima.swapaxes(0,2).swapaxes(1,2).flatten()
                # For the 2012 test set, the labels will be None
                lab_arr[k - failed,0] = 0 if l[1] is None else l[1] + label_offset
            except IOError:
                failed += 1
        return {'data': tgt[:len(strings) - failed,:],
                'labels': lab_arr[:len(strings) - failed,:]}
    # Thread body: load the pickled batch at self.path and publish the result.
    def run(self):
        p = JPEGBatchLoaderThread.load_jpeg_batch(unpickle(self.path),
                                                  self.data_mean,
                                                  self.no_crop,
                                                  self.label_offset,
                                                  self.tgt)
        self.list_out.append(p)
 class ColorNoiseMakerThread(Thread):
    # Worker thread that draws num_noise rows of 3-component Gaussian noise,
    # scales them by pca_stdevs, projects them through pca_vecs and appends the
    # resulting matrix to list_out.
    def __init__(self, pca_stdevs, pca_vecs, num_noise, list_out):
        Thread.__init__(self)
        self.pca_stdevs = pca_stdevs
        self.pca_vecs = pca_vecs
        self.num_noise = num_noise
        self.list_out = list_out
    # Thread body: generate one noise matrix and publish it.
    def run(self):
        gaussian = nr.randn(self.num_noise, 3).astype(n.single)
        scaled = gaussian * self.pca_stdevs.T
        self.list_out.append(n.dot(scaled, self.pca_vecs.T))
 class JPEGCroppedImageNetDP(ImageNetDP):
    # ImageNet provider that decodes JPEG batches, crops every image to an
    # inner_size x inner_size window, subtracts the (cropped or scalar) mean
    # and, when training with color_noise > 0, adds colour noise.
    # Training batches are decoded on a background JPEGBatchLoaderThread while
    # the previous batch is being consumed.
    def __init__(self, data_dir, batch_range=None, init_epoch=1, init_batchnum=None, dp_params=None, test=False):
        ImageNetDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
        self.mini = dp_params['minibatch_size']
        self.border_size = dp_params['crop_border']
        self.inner_size = self.img_size - self.border_size*2
        # Multi-view evaluation is only enabled for test providers.
        self.multiview = dp_params['multiview_test'] and test
        self.num_views = 5*2 # 5 crop positions, each also mirrored
        self.data_mult = self.num_views if self.multiview else 1
        self.crop_chunk = 32 # This many images will be cropped in the same way
        self.batch_size = self.batch_meta['batch_size']
        self.label_offset = 0 if 'label_offset' not in self.batch_meta else self.batch_meta['label_offset']
        self.no_crop = False if 'no_crop' not in self.batch_meta else self.batch_meta['no_crop']
        self.scalar_mean = 'scalar_mean' in dp_params and dp_params['scalar_mean']
        # Maintain pointers to previously-returned data matrices so they don't get garbage collected.
        # I've never seen this happen but it's a safety measure.
        self.data = [None, None] # These are pointers to previously-returned data matrices
        # This is where I crop data into
        self.cropped_data = [n.zeros((0*self.data_mult, self.get_data_dims()), dtype=n.float32) for x in xrange(2)]
        # This is where I load data into (jpeg --> uint8)
        self.orig_data = [n.zeros((self.batch_size, self.img_size**2*3), dtype=n.uint8) for x in xrange(1 if test else 2)]
        self.loader_thread, self.color_noise_thread = None, None
        self.convnet = dp_params['convnet']
        self.num_noise = self.batch_size
        self.batches_generated, self.loaders_started = 0, 0
        # Centre crop of the mean image, flattened to a single row ...
        self.data_mean_crop = self.data_mean.reshape((3,self.img_size,self.img_size))[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size].reshape((1,3*self.inner_size**2))
        # ... replaced by the overall mean value when a scalar mean is requested
        # or the batches are flagged no_crop.
        if self.no_crop or self.scalar_mean:
            self.data_mean_crop = self.data_mean.mean()
    # Returns the row count of the data matrix (idx 0: cropped pixels x 3
    # channels) or of the label matrix (any other idx: 1).
    def get_data_dims(self, idx=0):
        if idx == 0:
            return self.inner_size**2 * 3
        return 1
    # Starts decoding batch_range[batch_idx] in the background. The result is
    # appended to a fresh self.load_data list; the two image buffers are used
    # alternately.
    def start_loader(self, batch_idx):
        self.load_data = []
        self.loader_thread = JPEGBatchLoaderThread(self.data_dir,
                                                   self.get_data_file_name(self.batch_range[batch_idx]),
                                                   self.data_mean_crop,
                                                   self.no_crop,
                                                   self.label_offset,
                                                   self.orig_data[self.loaders_started],
                                                   self.load_data)
        self.loader_thread.start()
        self.loaders_started = (self.loaders_started + 1) % 2
    # Starts a ColorNoiseMakerThread and returns the list it will append its
    # noise matrix to.
    def start_color_noise_maker(self):
        color_noise_list = []
        self.color_noise_thread = ColorNoiseMakerThread(self.color_stdevs, self.color_eig, self.num_noise, color_noise_list)
        self.color_noise_thread.start()
        return color_noise_list
    # Label-extraction hook; returns None here and is overridden by subclasses
    # such as JPEGCroppedImageNetLogRegDP.
    def get_labels(self, datadic):
        pass
    # Returns (epoch, batchnum, [data, labels]) with both matrices transposed
    # so that cases run along the second axis.
    def get_next_batch(self):
        self.d_idx = self.batches_generated % 2
        if self.test:
            # Test batches are decoded synchronously.
            epoch, batchnum, self.data[self.d_idx] = LabeledDataProvider.get_next_batch(self)
            self.data[self.d_idx] = JPEGBatchLoaderThread.load_jpeg_batch(self.data[self.d_idx],
                                                                          self.data_mean_crop,
                                                                          self.no_crop,
                                                                          self.label_offset,
                                                                          self.orig_data[0])
        else:
            epoch, batchnum = self.curr_epoch, self.curr_batchnum
            if self.loader_thread is None:
                # First call: load the current batch now, then prefetch the next.
                self.start_loader(self.batch_idx)
                self.loader_thread.join()
                self.data[self.d_idx] = self.load_data[0]
                self.start_loader(self.get_next_batch_idx())
            else:
                # Set the argument to join to 0 to re-enable batch reuse
                self.loader_thread.join()
                if not self.loader_thread.is_alive():
                    self.data[self.d_idx] = self.load_data[0]
                    self.start_loader(self.get_next_batch_idx())
            self.advance_batch()
        cropped = self.get_cropped_data(self.data[self.d_idx])
        if self.color_noise_coeff > 0 and not self.test:
            # At this point the data already has 0 mean.
            # So I'm going to add noise to it, but I'm also going to scale down
            # the original data. This is so that the overall scale of the training
            # data doesn't become too different from the test data.
            ncases = cropped.shape[0]
            # Floor division: the value is used as a reshape dimension.
            cropped_size = self.get_data_dims(0) // 3
            if self.color_noise_thread is None:
                self.color_noise_list = self.start_color_noise_maker()
                self.color_noise_thread.join()
                self.color_noise = self.color_noise_list[0]
                self.color_noise_list = self.start_color_noise_maker()
            else:
                # Non-blocking poll: if the maker has not finished yet, the
                # previous noise matrix is used again.
                self.color_noise_thread.join(0)
                if not self.color_noise_thread.is_alive():
                    self.color_noise = self.color_noise_list[0]
                    self.color_noise_list = self.start_color_noise_maker()
            # One row per (case, channel) so a single noise value shifts a whole channel.
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((ncases*3, cropped_size))
            # Keep self.color_noise in its (num_noise, 3) layout: storing the
            # reshaped column back on self broke the reuse path above, because
            # the next call would slice and reshape the already-reshaped array.
            noise = self.color_noise[:ncases,:].reshape((3*ncases, 1))
            cropped += noise * self.color_noise_coeff
            cropped = self.cropped_data[self.d_idx] = cropped.reshape((ncases, 3* cropped_size))
            cropped /= (1.0 + self.color_noise_coeff)
        self.data[self.d_idx]['labels'] = self.get_labels(self.data[self.d_idx])
        self.data[self.d_idx]['data'] = cropped
        self.batches_generated += 1
        # NOTE: It would be good to add some logic here to pad irregularly-sized
        # batches by duplicating training cases.
        return epoch, batchnum, [self.data[self.d_idx]['data'].T, self.data[self.d_idx]['labels'].T]
    # Crops data['data'] into the current crop buffer (reallocating it when the
    # number of rows changed) and returns the buffer with the mean removed.
    def get_cropped_data(self, data):
        cropped = self.cropped_data[self.d_idx]
        if cropped.shape[0] != data['data'].shape[0] * self.data_mult:
            cropped = self.cropped_data[self.d_idx] = n.zeros((data['data'].shape[0] * self.data_mult, cropped.shape[1]), dtype=n.float32)
        self.__trim_borders(data['data'], cropped)
        return self.subtract_mean(cropped)
    # Subtracts the mean (cropped image or scalar) in place and returns the same array.
    def subtract_mean(self,data):
        data -= self.data_mean_crop
        return data
    # Takes as input an array returned by get_next_batch
    # Returns a (numCases, imgSize, imgSize, 3) array which can be
    # fed to pylab for plotting.
    # This is used by shownet.py to plot test case predictions.
    def get_plottable_data(self, data, add_mean=True):
        # NOTE(review): for F-contiguous input the un-transposed mean is added
        # to data before the transpose; confirm that this broadcast is what
        # callers passing F-contiguous arrays expect.
        mean = self.data_mean_crop if data.flags.f_contiguous or self.scalar_mean else self.data_mean_crop.T
        return n.require((data + (mean if add_mean else 0)).T.reshape(data.shape[1], 3, self.inner_size, self.inner_size).swapaxes(1,3).swapaxes(1,2) / 255.0, dtype=n.single)
    # Writes cropped versions of the images in x (one flattened image per row)
    # into target.
    #   test + multiview: the 4 corner crops and the centre crop, followed by
    #                     their left-right mirrored versions (num_views blocks
    #                     of x.shape[0] rows each).
    #   test only:        the centre crop.
    #   training:         per chunk of crop_chunk images, one random crop
    #                     offset and a 50% chance of a left-right flip.
    def __trim_borders(self, x, target):
        y = x.reshape(x.shape[0], 3, self.img_size, self.img_size)
        if self.test: # don't need to loop over cases
            if self.multiview:
                start_positions = [(0,0),  (0, self.border_size*2),
                                   (self.border_size, self.border_size),
                                  (self.border_size*2, 0), (self.border_size*2, self.border_size*2)]
                end_positions = [(sy+self.inner_size, sx+self.inner_size) for (sy,sx) in start_positions]
                half_views = self.num_views // 2
                for i in xrange(half_views):
                    pic = y[:,:,start_positions[i][0]:end_positions[i][0],start_positions[i][1]:end_positions[i][1]]
                    target[i * x.shape[0]:(i+1)* x.shape[0],:] = pic.reshape((x.shape[0], self.get_data_dims()))
                    target[(half_views + i) * x.shape[0]:(half_views +i+1)* x.shape[0],:] = pic[:,:,:,::-1].reshape((x.shape[0],self.get_data_dims()))
            else:
                pic = y[:,:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size] # just take the center for now
                target[:,:] = pic.reshape((x.shape[0], self.get_data_dims()))
        else:
            for c in xrange(0, x.shape[0], self.crop_chunk): # loop over cases in chunks
                startY, startX = nr.randint(0,self.border_size*2 + 1), nr.randint(0,self.border_size*2 + 1)
                endY, endX = startY + self.inner_size, startX + self.inner_size
                c_end = min(c + self.crop_chunk, x.shape[0])
                pic = y[c:c_end,:,startY:endY,startX:endX]
                if nr.randint(2) == 0: # also flip the images with 50% probability
                    pic = pic[:,:,:,::-1]
                target[c:c_end,:] = pic.reshape((c_end-c, self.get_data_dims()))
 class JPEGCroppedImageNetLogRegDP(JPEGCroppedImageNetDP):
    # Cropped ImageNet provider that supplies a single-column label matrix.
    def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params={}, test=False):
        JPEGCroppedImageNetDP.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
    # Returns the labels as a C-contiguous single-precision column with one row
    # per case, stacked data_mult times along the first axis.
    def get_labels(self, data):
        num_cases = data['data'].shape[0]
        column = n.array(data['labels'], dtype=n.single).reshape((num_cases, 1))
        return n.require(n.tile(column, (self.data_mult, 1)), requirements='C')
--- a/convnet.py
+++ b/convnet.py
@ -0,0 +1,218 @@
 import numpy as n
 import numpy.random as nr
 import random as r
 from util import *
 from data import *
 from options import *
 from gpumodel import *
 import sys
 import math as m
 import layer as lay
 from convdata import *
 from convdata_jpeg import JPEGCroppedImageNetLogRegDP
 from convdata_flickr import JPEGCroppedFlickrCEDP, DummyConvNetCEDP
 from convdata_cifar import CIFARDataProvider, CroppedCIFARDataProvider
 from os import linesep as NL
 import pylab as pl
 import copy as cp
 class ConvNet(IGPUModel):
    def __init__(self, op, load_dic, dp_params={}):
        filename_options = []
        for v in ('color_noise', 'pca_noise', 'multiview_test', 'crop_border', 'scalar_mean', 'minibatch_size'):
            dp_params[v] = op.get_value(v)
        IGPUModel.__init__(self, "ConvNet", op, load_dic, filename_options, dp_params=dp_params)
        self.writing_test = False
    def import_model(self):
        lib_name = "_ConvNet_k20x" if is_kepler_machine() else "_ConvNet"
        print "========================="
        print "Importing %s C++ module" % lib_name
        self.libmodel = __import__(lib_name) 
    def init_model_lib(self):
        self.libmodel.initModel(self.layers, self.device_ids, self.device_cpus, self.minibatch_size, self.wupdate_freq)
    def init_model_state(self):
        ms = self.model_state
        if self.load_file:
            ms['layers'] = lay.LayerParser.parse_layers(self.layer_def, self.layer_params, self, ms['layers'])
        else:    
            ms['layers'] = lay.LayerParser.parse_layers(self.layer_def, self.layer_params, self)
        # Convert convolutional layers to local
        if len(self.op.get_value('conv_to_local')) > 0:
            for i, layer in enumerate(ms['layers']):
                if layer['type'] == 'conv' and layer['name'] in self.op.get_value('conv_to_local'):
                    lay.LocalLayerParser.conv_to_local(ms['layers'], i)
        # Decouple weight matrices
        if len(self.op.get_value('unshare_weights')) > 0:
            for name_str in self.op.get_value('unshare_weights'):
                if name_str:
                    name = lay.WeightLayerParser.get_layer_name(name_str)
                    if name is not None:
                        name, idx = name[0], name[1]
                        if name not in ms['layers']:
                            raise ModelStateException("Layer '%s' does not exist; unable to unshare" % name)
                        layer = ms['layers'][name]
                        lay.WeightLayerParser.unshare_weights(layer, ms['layers'], matrix_idx=idx)
                    else:
                        raise ModelStateException("Invalid layer name '%s'; unable to unshare." % name_str)
        self.op.set_value('conv_to_local', [], parse=False)
        self.op.set_value('unshare_weights', [], parse=False)
        self.writing_test = False
    def get_layer_idx(self, layer_name, check_type=[]):
        try:
            layer_idx = [l['name'] for l in self.model_state['layers']].index(layer_name)
            if check_type:
                layer_type = self.model_state['layers'][layer_idx]['type']
                if layer_type not in check_type:
                    raise ModelStateException("Layer with name '%s' has type '%s'; should be one of %s." % (layer_name, layer_type, ",".join("'%s'" %s for s in check_type)))
            return layer_idx
        except ValueError:
            raise ModelStateException("Layer with name '%s' not defined." % layer_name)
    def fill_excused_options(self):
        if self.op.get_value('check_grads'):
            self.op.set_value('save_path', '')
            self.op.set_value('train_batch_range', '0')
            self.op.set_value('test_batch_range', '0')
            self.op.set_value('data_path', '')
    # Make sure the data provider returned data in proper format
    def parse_batch_data(self, batch_data, train=True):
        if max(d.dtype != n.single for d in batch_data[2]):
            raise DataProviderException("All matrices returned by data provider must consist of single-precision floats.")
        return batch_data
    def start_batch(self, batch_data, train=True):
        data = batch_data[2]
        self.writing_test = False
        if self.check_grads:
            self.libmodel.checkGradients(data)
        elif not train and self.multiview_test:
            num_views = self.test_data_provider.num_views
            if self.test_out != "" and self.logreg_name != "":
                self.writing_test = True
                self.test_file_name = os.path.join(self.test_out, 'test_preds_%d' % batch_data[1])
                self.probs = n.zeros((data[0].shape[1]/num_views, self.test_data_provider.get_num_classes()), dtype=n.single)
                self.libmodel.startMultiviewTest(data, num_views, self.probs, self.logreg_name)
            else:
                self.libmodel.startMultiviewTest(data, num_views)
        else:
            num_batches_total = self.num_epochs * len(self.train_batch_range)
            progress = min(1.0, max(0.0, float(self.get_num_batches_done()-1) / num_batches_total))
            self.libmodel.startBatch(data, progress, not train)
    def finish_batch(self):
        ret = IGPUModel.finish_batch(self)
        if self.writing_test:
            if not os.path.exists(self.test_out):
                os.makedirs(self.test_out)
            pickle(self.test_file_name,  {'data': self.probs,
                                          'note': 'generated from %s' % self.save_file})
        return ret
    def print_iteration(self):
        print "%d.%d..." % (self.epoch, self.batchnum),
    def print_train_time(self, compute_time_py):
        print "(%.3f sec)" % (compute_time_py)
    def print_costs(self, cost_outputs):
        costs, num_cases = cost_outputs[0], cost_outputs[1]
        for errname in costs.keys():
            costs[errname] = [(v/num_cases) for v in costs[errname]]
            print "%s: " % errname,
            print ", ".join("%.6f" % v for v in costs[errname]),
            if sum(m.isnan(v) for v in costs[errname]) > 0 or sum(m.isinf(v) for v in costs[errname]):
                print "^ got nan or inf!"
                sys.exit(1)
    def print_train_results(self):
        self.print_costs(self.train_outputs[-1])
    def print_test_status(self):
        pass
    def print_test_results(self):
        print NL + "======================Test output======================"
        self.print_costs(self.test_outputs[-1])
        print NL + "----------------------Averages-------------------------"
        self.print_costs((self.aggregate_test_outputs(self.test_outputs[-len(self.test_batch_range):])[0], min(len(self.test_outputs), len(self.test_batch_range))))
        print NL + "-------------------------------------------------------",
        for name in sorted(self.layers.keys()): # This is kind of hacky but will do for now.
            l = self.layers[name]
            if 'weights' in l:
                if type(l['weights']) == n.ndarray:
                    print "%sLayer '%s' weights: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['weights'])), n.mean(n.abs(l['weightsInc']))),
                elif type(l['weights']) == list:
                    print ""
                    print NL.join("Layer '%s' weights[%d]: %e [%e]" % (l['name'], i, n.mean(n.abs(w)), n.mean(n.abs(wi))) for i,(w,wi) in enumerate(zip(l['weights'],l['weightsInc']))),
                print "%sLayer '%s' biases: %e [%e]" % (NL, l['name'], n.mean(n.abs(l['biases'])), n.mean(n.abs(l['biasesInc']))),
        print ""
    def conditional_save(self):
        self.save_state()
        print "-------------------------------------------------------"
        print "Saved checkpoint to %s" % os.path.join(self.save_path, self.save_file)
        print "=======================================================",
    def aggregate_test_outputs(self, test_outputs):
        test_outputs = cp.deepcopy(test_outputs)
        num_cases = sum(t[1] for t in test_outputs)
        for i in xrange(1 ,len(test_outputs)):
            for k,v in test_outputs[i][0].items():
                for j in xrange(len(v)):
                    test_outputs[0][0][k][j] += test_outputs[i][0][k][j]
        return (test_outputs[0][0], num_cases)
    @classmethod
    def get_options_parser(cls):
        op = IGPUModel.get_options_parser()
        op.add_option("mini", "minibatch_size", IntegerOptionParser, "Minibatch size", default=128)
        op.add_option("layer-def", "layer_def", StringOptionParser, "Layer definition file", set_once=True)
        op.add_option("layer-params", "layer_params", StringOptionParser, "Layer parameter file")
        op.add_option("check-grads", "check_grads", BooleanOptionParser, "Check gradients and quit?", default=0, excuses=['data_path','save_path','train_batch_range','test_batch_range'])
        op.add_option("multiview-test", "multiview_test", BooleanOptionParser, "Cropped DP: test on multiple patches?", default=0)
        op.add_option("crop-border", "crop_border", IntegerOptionParser, "Cropped DP: crop border size", default=4, set_once=True)
        op.add_option("conv-to-local", "conv_to_local", ListOptionParser(StringOptionParser), "Convert given conv layers to unshared local", default=[])
        op.add_option("unshare-weights", "unshare_weights", ListOptionParser(StringOptionParser), "Unshare weight matrices in given layers", default=[])
        op.add_option("conserve-mem", "conserve_mem", BooleanOptionParser, "Conserve GPU memory (slower)?", default=0)
        op.add_option("color-noise", "color_noise", FloatOptionParser, "Add PCA noise to color channels with given scale", default=0.0)
        op.add_option("test-out", "test_out", StringOptionParser, "Output test case predictions to given path", default="", requires=['logreg_name', 'multiview_test'])
        op.add_option("logreg-name", "logreg_name", StringOptionParser, "Logreg cost layer name (for --test-out)", default="")
        op.add_option("pca-noise", "pca_noise", FloatOptionParser, "Add PCA noise to pixels with given scale", default=0.0)
        op.add_option("scalar-mean", "scalar_mean", FloatOptionParser, "Subtract scalar pixel mean (as opposed to vector)?", default=False)
        op.add_option("wupdate-freq", "wupdate_freq", IntegerOptionParser, "Weight update (inverse) frequency, in minibatches (1 = every minibatch)", default=1)
        op.delete_option('max_test_err')
        op.options["max_filesize_mb"].default = 0
        op.options["testing_freq"].default = 50
        op.options["num_epochs"].default = 50000
        op.options['dp_type'].default = None
        DataProvider.register_data_provider('dummy-lr-n', 'Dummy ConvNet logistic regression', DummyConvNetLogRegDP)
        DataProvider.register_data_provider('inet-lr', 'ImageNet logistic regression', ImageNetLogRegDP)
        DataProvider.register_data_provider('inet-lr-cropped', 'ImageNet logistic regression cropped', CroppedImageNetLogRegDP)
        DataProvider.register_data_provider('inet-lr-cropped-jpeg', 'ImageNet logistic regression cropped JPEG', JPEGCroppedImageNetLogRegDP)
        DataProvider.register_data_provider('inet-rs-lr-cropped', 'Random scale cropped ImageNet logistic regression', RandomScaleImageNetLogRegDP)
        DataProvider.register_data_provider('flickr-ce-cropped', 'Flickr cross-entropy cropped', JPEGCroppedFlickrCEDP)
        DataProvider.register_data_provider('dummy-ce-n', 'Dummy cross-entropy', DummyConvNetCEDP)
        DataProvider.register_data_provider('flatmem', 'Flat memory', FlatMemoryDataProvider)
        DataProvider.register_data_provider('cifar', 'CIFAR', CIFARDataProvider)
        DataProvider.register_data_provider('cifar-cropped', 'Cropped CIFAR', CroppedCIFARDataProvider)
        return op
 if __name__ == "__main__":
    # Script entry point: build the ConvNet options parser, let
    # IGPUModel.parse_options process it (this also yields load_dic), then
    # construct the model and start it.
    # Disabled fixed seed for numpy.random (imported as nr):
    #nr.seed(5)
    op = ConvNet.get_options_parser()
    op, load_dic = IGPUModel.parse_options(op)
    model = ConvNet(op, load_dic)
    model.start()
--- a/deviceQuery.txt
+++ b/deviceQuery.txt
@ -0,0 +1,143 @@
 /u/kriz/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/deviceQuery Starting...
 CUDA Device Query (Runtime API) version (CUDART static linking)
 Found 4 CUDA Capable device(s)
 Device 0: "Tesla S2050"
  CUDA Driver Version / Runtime Version          4.2 / 4.2
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and execution:                 Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support enabled:                No
  Device is using TCC driver mode:               No
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           7 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 Device 1: "Tesla S2050"
  CUDA Driver Version / Runtime Version          4.2 / 4.2
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and execution:                 Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support enabled:                No
  Device is using TCC driver mode:               No
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           8 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 Device 2: "Tesla S2050"
  CUDA Driver Version / Runtime Version          4.2 / 4.2
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and execution:                 Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support enabled:                No
  Device is using TCC driver mode:               No
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           16 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 Device 3: "Tesla S2050"
  CUDA Driver Version / Runtime Version          4.2 / 4.2
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and execution:                 Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support enabled:                No
  Device is using TCC driver mode:               No
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           17 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 4, Device = Tesla S2050, Device = Tesla S2050
--- a/example-layers/layer-params-18pct.cfg
+++ b/example-layers/layer-params-18pct.cfg
@ -0,0 +1,35 @@
 # 18% error on CIFAR-10 in 20 minutes - layer parameter file
 # Reduce all learning rates by factor of 10 after 120 epochs.
 # Then another factor of 10 after 10 more epochs.
 [conv1]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [fc10]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=1
 [logprob]
 coeff=1
--- a/example-layers/layer-params-19pct.cfg
+++ b/example-layers/layer-params-19pct.cfg
@ -0,0 +1,33 @@
 # 19% error on CIFAR-10 in 20 minutes - layer parameter file 
 # Set wc to 0 for translations -- 14.2%
 [conv1]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [fc10]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=3
 [logprob]
 coeff=1
--- a/example-layers/layer-params-80sec.cfg
+++ b/example-layers/layer-params-80sec.cfg
@ -0,0 +1,39 @@
 # 26% error on CIFAR-10 in 80 seconds - layer parameter file 
 [conv1]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [conv3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [fc64]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=.03
 [fc10]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=.03
 [logprob]
 coeff=1
--- a/example-layers/layer-params-conv-local-12pct.cfg
+++ b/example-layers/layer-params-conv-local-12pct.cfg
@ -0,0 +1,40 @@
 # 12% error on CIFAR-10 - layer parameter file 
 # See methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
 [conv1]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 [conv2]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 [local3]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [local4]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [fc10]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [logprob]
 coeff=1
--- a/example-layers/layer-params-conv-local-13pct.cfg
+++ b/example-layers/layer-params-conv-local-13pct.cfg
@ -0,0 +1,40 @@
 # 13% error on CIFAR-10 - layer parameter file 
 # See methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
 [conv1]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 [conv2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 [local3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [local4]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [fc10]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 [logprob]
 coeff=1
--- a/example-layers/layer-params-example.cfg
+++ b/example-layers/layer-params-example.cfg
@ -0,0 +1,44 @@
 [conv32]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [local32]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [fc1024]
 momW=0.9
 momB=0.9
 epsW=0.00001
 epsB=0.00002
 wc=0
 [conv32-2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [conv32-3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [fc10]
 epsW=0.0001,0.001
 epsB=0.002
 momW=0.5,0.9
 momB=0.9
 wc=0,0
 [logprob]
 coeff=1
--- a/example-layers/layer-params.gc.cfg
+++ b/example-layers/layer-params.gc.cfg
@ -0,0 +1,66 @@
 [conv32a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [conv32b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [conv32c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0
 [fc10]
 wc=0
 momB=0
 momW=0
 epsW=0.00001
 epsB=0.00002
 [fc16a]
 wc=0,0,0
 momB=0
 momW=0,0,0
 epsW=0.00001,0.1,0.1
 epsB=0.00002
 [fc16b]
 wc=0,0,0
 momB=0
 momW=0,0,0
 epsW=0.00001,0.1,0.1
 epsB=0.00002
 [fc16c]
 wc=0,0,0
 momB=0
 momW=0,0,0
 epsW=0.00001,0.1,0.1
 epsB=0.00002
 [logreg]
 coeff=1
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [rnorm1c]
 scale=0.0001
 pow=0.75
 minDiv=0.25
--- a/example-layers/layers-18pct.cfg
+++ b/example-layers/layers-18pct.cfg
@ -0,0 +1,109 @@
 # 18% error on CIFAR-10 in 20 minutes - layer definition file 
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=32
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=4
 sharedBiases=1
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [rnorm1]
 type=rnorm
 inputs=pool1
 channels=32
 sizeX=3
 scale=0.00005
 pow=.75
 [conv2]
 type=conv
 inputs=rnorm1
 filters=32
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool2]
 type=pool
 pool=avg
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [rnorm2]
 type=rnorm
 inputs=pool2
 channels=32
 sizeX=3
 scale=0.00005
 pow=.75
 [conv3]
 type=conv
 inputs=rnorm2
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool3]
 type=pool
 pool=avg
 inputs=conv3
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [fc10]
 type=fc
 outputs=10
 inputs=pool3
 initW=0.01
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers-19pct.cfg
+++ b/example-layers/layers-19pct.cfg
@ -0,0 +1,93 @@
 # 19% error on CIFAR-10 in 20 minutes - layer definition file 
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=32
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=1
 sharedBiases=1
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [conv2]
 type=conv
 inputs=pool1
 filters=32
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=1
 sharedBiases=1
 [pool2]
 type=pool
 pool=avg
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [conv3]
 type=conv
 inputs=pool2
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=1
 sharedBiases=1
 [pool3]
 type=pool
 pool=avg
 inputs=conv3
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [fc10]
 type=fc
 outputs=10
 inputs=pool3
 initW=0.01
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers-80sec.cfg
+++ b/example-layers/layers-80sec.cfg
@ -0,0 +1,100 @@
 # 26% error on CIFAR-10 in 80 seconds - layer definition file 
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=32
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=4
 sharedBiases=1
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [conv2]
 type=conv
 inputs=pool1
 filters=32
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool2]
 type=pool
 pool=avg
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [conv3]
 type=conv
 inputs=pool2
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool3]
 type=pool
 pool=avg
 inputs=conv3
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [fc64]
 type=fc
 outputs=64
 inputs=pool3
 initW=0.1
 neuron=relu
 [fc10]
 type=fc
 outputs=10
 inputs=fc64
 initW=0.1
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers-conv-local-12pct.cfg
+++ b/example-layers/layers-conv-local-12pct.cfg
@ -0,0 +1,92 @@
 # 12% error on CIFAR-10 - layer definition file
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=64
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=4
 sharedBiases=1
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [conv2]
 type=conv
 inputs=pool1
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=64
 neuron=relu
 initW=0.01
 partialSum=8
 sharedBiases=1
 [pool2]
 type=pool
 pool=max
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [local3]
 type=local
 inputs=pool2
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=64
 neuron=relu
 initW=0.04
 [local4]
 type=local
 inputs=local3
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=32
 neuron=relu
 initW=0.04
 [fc10]
 type=fc
 outputs=10
 inputs=local4
 initW=0.01
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers-conv-local-13pct.cfg
+++ b/example-layers/layers-conv-local-13pct.cfg
@ -0,0 +1,93 @@
 # 13% error on CIFAR-10 in 20 minutes - layer definition file 
 # See methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=64
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=4
 sharedBiases=1
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [conv2]
 type=conv
 inputs=pool1
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=64
 neuron=relu
 initW=0.01
 partialSum=8
 sharedBiases=1
 [pool2]
 type=pool
 pool=max
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [local3]
 type=local
 inputs=pool2
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=64
 neuron=relu
 initW=0.04
 [local4]
 type=local
 inputs=local3
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=32
 neuron=relu
 initW=0.04
 [fc10]
 type=fc
 outputs=10
 inputs=local4
 initW=0.01
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers-example.cfg
+++ b/example-layers/layers-example.cfg
@ -0,0 +1,115 @@
 # This is a layer configuration file that contains all the 
 # layer types supported by this code. It's not actually good for anything
 # other than demonstrating how layers are specified and connected to one another.
 # Note: this file has gotten so big that the resultant net will not run on anything short of a 3GB GTX 580.
 # But there's no particular reason to run the net specified by this file. It's not actually good.
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv32]
 type=conv
 inputs=data
 channels=3
 filters=32
 padding=4
 stride=1
 filterSize=9
 neuron=logistic
 initW=0.00001
 partialSum=1
 sharedBiases=true
 [local32]
 type=local
 inputs=conv32
 channels=32
 filters=32
 padding=4
 stride=1
 filterSize=9
 neuron=logistic
 initW=0.00001
 [fc1024]
 type=fc
 outputs=1024
 inputs=data
 initW=0.001
 neuron=relu
 [maxpool]
 type=pool
 pool=max
 inputs=local32
 start=0
 sizeX=4
 stride=2
 outputsX=0
 channels=32
 [rnorm1]
 type=rnorm
 inputs=maxpool
 channels=32
 sizeX=5
 scale=0.0000125
 pow=0.75
 [cnorm1]
 type=cnorm
 inputs=rnorm1
 channels=32
 sizeX=7
 scale=0.001
 pow=0.5
 [conv32-2]
 type=conv
 inputs=cnorm1
 groups=4
 channels=32
 filters=32
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=1
 sharedBiases=false
 [conv32-3]
 type=conv
 inputs=conv32-2
 groups=4
 channels=128
 filters=32
 padding=2
 stride=2
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=1
 randSparse=true
 filterChannels=64
 [fc10]
 type=fc
 outputs=10
 inputs=conv32-3,fc1024
 initW=0.0001,0.0001
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
--- a/example-layers/layers.gc.cfg
+++ b/example-layers/layers.gc.cfg
@ -0,0 +1,112 @@
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv32a]
 type=conv
 inputs=data
 filters=16
 padding=0
 stride=1
 filterSize=3
 channels=3
 neuron=relu
 initW=0.3
 initB=1
 partialSum=1
 sharedBiases=true
 gpu=0
 [conv32b]
 type=conv
 inputs=data
 filters=16
 padding=0
 stride=1
 filterSize=3
 channels=3
 neuron=relu
 initW=0.3
 initB=1
 partialSum=1
 sharedBiases=true
 gpu=1
 [conv32c]
 type=conv
 inputs=data
 filters=16
 padding=0
 stride=1
 filterSize=3
 channels=3
 neuron=relu
 initW=0.3
 initB=1
 partialSum=1
 sharedBiases=true
 gpu=2
 [rnorm1a]
 type=cmrnorm
 inputs=conv32a
 channels=16
 size=5
 [rnorm1b]
 type=cmrnorm
 inputs=conv32b
 channels=16
 size=5
 [rnorm1c]
 type=cmrnorm
 inputs=conv32c
 channels=16
 size=5
 [fc16a]
 type=fc
 outputs=16
 inputs=rnorm1a,rnorm1b,rnorm1c
 initW=0.1,0.1,0.1
 gpu=0
 [fc16b]
 type=fc
 outputs=16
 inputs=rnorm1b,rnorm1c,rnorm1a
 initW=0.1,0.1,0.1
 gpu=1
 [fc16c]
 type=fc
 outputs=16
 # NOTE(review): rnorm1a is listed twice and rnorm1b is absent, whereas fc16a
 # and fc16b each take all three rnorm layers in rotated order. Possibly meant
 # to be rnorm1c,rnorm1a,rnorm1b -- confirm before relying on this net.
 inputs=rnorm1c,rnorm1a,rnorm1a
 initW=0.1,0.1,0.1
 gpu=2
 [concat]
 type=concat
 inputs=fc16a,fc16c,fc16b
 [fc10]
 type=fc
 inputs=concat
 outputs=10
 initW=0.08
 gpu=0
 [probs]
 type=softmax
 inputs=fc10
 gpu=0
 [logreg]
 type=cost.logreg
 inputs=labels,probs
 gpu=0
--- a/findsimilar.py
+++ b/findsimilar.py
@ -0,0 +1,78 @@
 import os
 import sys
 from getopt import getopt
 import numpy as n
 import numpy.random as nr
 from time import time
 from util import *
 import pylab as pl
 import gc
 # Directory of pickled image batches (their 'data' entry holds the pixels).
 imnet_dir = '/storage2/imnet-contest'
 # Directory of pickled feature batches; batch file names match imnet_dir.
 ftr_dir = '/storage2/imnet-features-4096'
 # Number of query images sampled at random from TEST_BATCH.
 TEST_IMGS = 128
 # Number of closest matches kept (and drawn) for each query image.
 TOP_IMGS = 16
 # Batch file the query images are drawn from.
 TEST_BATCH = 'data_batch_3000'
 # Side length of the square, 3-channel images.
 IMG_SIZE = 256
 # Number of query images (one per row) drawn on each figure.
 IMGS_PER_FIGURE = 16
 def draw_fig(test_imgs, tops):
    """Draw every query image next to its current best matches.

    One pylab figure is filled per IMGS_PER_FIGURE query images. Each row holds
    the query image on the left followed by TOP_IMGS match slots; a slot whose
    'img' entry is still None is left black.

    test_imgs -- array whose column i reshapes to (3, IMG_SIZE, IMG_SIZE)
    tops      -- for each query image, a list of TOP_IMGS dicts with an 'img' entry
    """
    tile = IMG_SIZE + 1  # tile pitch: one image plus a one-pixel separator
    img_shape = (3, IMG_SIZE, IMG_SIZE)
    canvas_shape = (3, tile * IMGS_PER_FIGURE - 1, tile * (1 + TOP_IMGS) + 3)
    for fig in xrange(TEST_IMGS // IMGS_PER_FIGURE):
        pl.figure(fig + 1, figsize=(15,15))
        pl.clf()
        canvas = n.zeros(canvas_shape, dtype=n.single)
        for row in xrange(IMGS_PER_FIGURE):
            img_idx = fig * IMGS_PER_FIGURE + row
            rows = slice(tile * row, tile * row + IMG_SIZE)
            canvas[:, rows, :IMG_SIZE] = test_imgs[:,img_idx].reshape(img_shape)
            for col in xrange(TOP_IMGS):
                match = tops[img_idx][col]['img']
                if match is None:
                    continue
                # Match tiles start 4 pixels to the right of the query image
                left = IMG_SIZE + 4 + col * tile
                canvas[:, rows, left:left + IMG_SIZE] = match.reshape(img_shape)
        # Scale pixel values by 1/255 before display
        canvas /= 255
        pl.imshow(canvas.swapaxes(0,1).swapaxes(1,2), interpolation='lanczos')
 if __name__ == "__main__":
    # No options are defined; getopt is only run over the command line.
    (options, args) = getopt(sys.argv[1:], "")
    options = dict(options)
    # Sample TEST_IMGS random cases from the query batch. The same permutation
    # indexes the feature rows, the label columns and the image columns.
    dic = unpickle(os.path.join(ftr_dir, TEST_BATCH))
    p = nr.permutation(dic['data'].shape[0])[:TEST_IMGS]
    data = dic['data'][p,:]
    labels = dic['labels'][:,p]
    dicimgs = unpickle(os.path.join(imnet_dir, TEST_BATCH))
    test_imgs = dicimgs['data'][:,p]
    # tops[q] always holds TOP_IMGS slots kept in ascending order of distance.
    tops = []
    for _ in xrange(TEST_IMGS):
        tops.append([{'dist': n.inf, 'batch': 0, 'idx': 0, 'img': None} for _ in xrange(TOP_IMGS)])
    pl.ion()
    for b in xrange(1, 1335):
        batch_name = 'data_batch_%d' % b
        dic = unpickle(os.path.join(ftr_dir, batch_name))
        dicimgs = unpickle(os.path.join(imnet_dir, batch_name))
        t = time()
        # Squared Euclidean distance from each query to every case of this batch
        dists = [n.sum((data[q,:] - dic['data'])**2, axis=1) for q in xrange(TEST_IMGS)]
        minidx = [d.argmin() for d in dists]
        print(dists[0].shape)
        # Only the single closest case of the batch competes for a slot.
        for top, dist, midx in zip(tops, dists, minidx):
            best = dist[midx]
            pos = TOP_IMGS
            while pos > 0 and best < top[pos - 1]['dist']:
                pos -= 1
            if pos == TOP_IMGS:
                continue
            # Insert in sorted position and drop the now-worst entry
            top.insert(pos, {'dist': best, 'batch': b, 'idx': midx, 'img': dicimgs['data'][:,midx].copy()})
            top.pop()
        # Release the batch before the next one is loaded
        del dic
        del dicimgs
        del dists
        del minidx
        gc.collect()
        print("Finished training batch %d (%f sec)" % (b, time() - t))
        if b % 50 == 0:
            draw_fig(test_imgs, tops)
            pl.draw()
    pl.ioff()
    draw_fig(test_imgs, tops)
    pl.show()
--- a/fix-big-imgnet.py
+++ b/fix-big-imgnet.py
@ -0,0 +1,40 @@
 import os
 import sys
 from PIL import Image
 from StringIO import StringIO
 from util import *
 # Input directory; batch files are named by their integer index.
 src = '/ais/gobi3/u/ilya/jpg_valid_2010_85/'
 # Output directory for the re-packed data_batch_N files.
 dst = '/ais/gobi3/u/kriz/lsvrc-2010-jpg/'
 # Number of images written to each output batch.
 BATCH_SIZE = 1024
 def save_batch(c_strings, c_labels, c_wnids, out_b):
    """Pickle one output batch into dst and return the next batch number.

    The batch is stored as the tuple (c_strings, c_labels, c_wnids) in the file
    'data_batch_<out_b>'.
    """
    batch_path = os.path.join(dst, 'data_batch_%d' % out_b)
    pickle(batch_path, (c_strings, c_labels, c_wnids))
    return out_b + 1
 if __name__ == "__main__":
    # Re-pack the source batches: keep only the image strings PIL can decode
    # and write them, with their labels and wnids, in batches of BATCH_SIZE
    # numbered from 2000.
    c_strings = []
    c_labels = []
    c_wnids = []
    out_b = 2000
    for b in xrange(49):
        failed = 0
        strings, sizes, labels = unpickle(os.path.join(src, '%s' % b))
        for s,l in zip(strings, labels):
            # Guard the decode alone: an IOError raised while *writing* a batch
            # must propagate rather than be counted as an undecodable image
            # (which would also leave the accumulators un-reset).
            try:
                Image.open(StringIO(s)).convert('RGB')
            except IOError:
                failed += 1
                continue
            c_strings.append(s)
            # Each label entry is indexed as (wnid, label)
            c_labels.append(l[1])
            c_wnids.append(l[0])
            if len(c_strings) == BATCH_SIZE:
                out_b = save_batch(c_strings, c_labels, c_wnids, out_b)
                c_strings = []
                c_labels = []
                c_wnids = []
        print("Batch %d failed: %d" % (b, failed))
    # Flush the final, partially filled batch
    if len(c_strings) > 0:
        save_batch(c_strings, c_labels, c_wnids, out_b)
--- a/fix-flickr.py
+++ b/fix-flickr.py
@ -0,0 +1,41 @@
 import os
 import sys
 from PIL import Image
 from StringIO import StringIO
 from util import *
 # Input directory; batch files are named by their integer index.
 src = '/ais/gobi3/u/ilya/flickr_85/'
 # Output directory for the re-packed data_batch_N files.
 dst = '/ais/gobi3/u/kriz/flickr-85-1024/'
 # Number of images written to each output batch.
 BATCH_SIZE = 2048
 def save_batch(c_strings, c_sizes, c_labels, out_b):
    """Pickle one output batch into dst and return the next batch number.

    The batch is stored as the tuple (c_strings, c_sizes, c_labels) in the file
    'data_batch_<out_b>'.
    """
    batch_path = os.path.join(dst, 'data_batch_%d' % out_b)
    pickle(batch_path, (c_strings, c_sizes, c_labels))
    return out_b + 1
 if __name__ == "__main__":
    # Re-pack the source batches: keep only the image strings PIL can decode
    # and write them, with their sizes and labels, in batches of BATCH_SIZE
    # numbered from 1.
    c_strings = []
    c_sizes = []
    c_labels = []
    out_b = 1
    for b in xrange(977):
        failed = 0
        strings, sizes, labels = unpickle(os.path.join(src, '%s' % b))
        for s,z,l in zip(strings, sizes, labels):
            # Guard the decode alone: an IOError raised while *writing* a batch
            # must propagate rather than be counted as an undecodable image
            # (which would also leave the accumulators un-reset).
            try:
                Image.open(StringIO(s)).convert('RGB')
            except IOError:
                failed += 1
                continue
            c_strings.append(s)
            c_sizes.append(z)
            c_labels.append(l)
            if len(c_strings) == BATCH_SIZE:
                out_b = save_batch(c_strings, c_sizes, c_labels, out_b)
                c_strings = []
                c_sizes = []
                c_labels = []
        print("Batch %d failed: %d" % (b, failed))
    # Flush the final, partially filled batch
    if len(c_strings) > 0:
        save_batch(c_strings, c_sizes, c_labels, out_b)
--- a/gen-py-interface.py
+++ b/gen-py-interface.py
@ -0,0 +1,65 @@
 import sys
 import re
 import os
 # Signature of the C++ constructor the generated interface wraps, written as
 # "Class::Class(type name, ...)".
 MODEL_CONSTRUCTOR = """ConvNet::ConvNet(PyListObject* layerParams, int minibatchSize, int deviceID)"""
 # C++ parameter type -> name of the Python type object emitted alongside an
 # "O!" format unit; an empty string means the type is parsed with a plain
 # format character from argstring_mappings instead.
 pytype_mappings = {"float": "",
                   "int": "",
                   "bool":"",
                   "PyListObject": "PyList_Type"}
 # C++ parameter type -> format character appended to the argument format string.
 argstring_mappings = {"float": "d",
                       "bool":"i",
                       "int": "i"}
 # C++ parameter type -> type used to declare the variable that receives the
 # parsed argument.
 init_type_mappings = {"float": "double",
                       "int": "int",
                       "bool":"int",
                       "PyListObject": "PyListObject*"}
 if __name__ == "__main__":
    # Split the constructor signature into the class name and its parameters.
    ctor = re.match(r"^(\w+)::\w+\((.*)\)$", MODEL_CONSTRUCTOR, re.MULTILINE | re.DOTALL)
    model_name = ctor.group(1)
    model_params = ctor.group(2).split(',')
    with open('./pyInterface.cutemp', 'r') as f:
        template = f.read()
    template = template.replace("${MODEL_NAME}", model_name)
    template = template.replace("${MODEL_NAME_LOWER}", model_name.lower())
    decl_lines = []   # one variable declaration per parameter
    parse_args = []   # address-of arguments handed to the argument parser
    call_args = []    # arguments of the generated constructor call
    arg_string = ""
    model_preamble = ""
    model_open = "    model = new %s(" % model_name
    # Continuation lines of the constructor call line up under the first argument
    pad = " " * len(model_open)
    for raw in model_params:
        tokens = raw.strip().split(' ')
        ptype = re.match(r"^([\w<>\*]+)", tokens[0]).group(1).strip('*')
        pname = tokens[1].strip('*')
        # fooBar -> pyFooBar
        pname = "py" + pname[0].upper() + pname[1:]
        if ptype not in pytype_mappings:
            print("Unknown type: %s" % ptype)
            sys.exit(1)
        mapping = pytype_mappings[ptype]
        if mapping == "":
            arg_string += argstring_mappings[ptype]
            parse_args.append("                          &%s" % pname)
        else:
            # Object parameters also pass the expected Python type object
            arg_string += "O!"
            parse_args.append("                          &%s, &%s" % (mapping, pname))
        call_args.append(pname if not call_args else pad + pname)
        decl_lines.append("    %s %s;\n" % (init_type_mappings[ptype], pname))
    init_vars = "".join(decl_lines)
    init_parse = ",\n".join(parse_args)
    model_start = model_open + ",\n".join(call_args) + ");\n"
    template = template.replace("${INIT_VARS}", init_vars)
    template = template.replace("${INIT_PARSE}", init_parse)
    template = template.replace("${ARG_STRING}", arg_string)
    template = template.replace("${MODEL_START}", model_preamble + model_start)
    print(template)
--- a/include/convnet.cuh
+++ b/include/convnet.cuh
@ -0,0 +1,163 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef CONVNET3
 #define	CONVNET3
 #include <vector>
 #include <string>
 #include <set>
 #include <map>
 #include <helper_cuda.h>
 #include <time.h>
 #include <queue.h>
 #include <thread.h>
 #include <math.h>
 #include <sync.h>
 #include <quantizer.cuh>
 #include <messages.cuh>
 #include <pipedispenser.cuh>
 #include "layer.cuh"
 #include "data.cuh"
 #include "worker.cuh"
 #include "weights.cuh"
 #include "hostmem.cuh"
 class Worker;
 class WorkResult;
 class Layer;
 class DataLayer;
 class CostLayer;
 class ConvNetGPU;
 /*
 * Top-level network object. It is itself a Thread and keeps a list of
 * ConvNetGPU threads together with the worker, result and message queues.
 * NOTE(review): presumably one ConvNetGPU is created per entry of deviceIDs --
 * confirm against the constructor definition.
 */
 class ConvNet : public Thread {
 protected:
    std::map<std::string,Layer*> _layerMap;   // Layers keyed by name
    std::vector<DataLayer*> _dataLayers;
    std::vector<ConvNetGPU*> _convNetThreads; // List of convnet threads
    DataProvider* _dp;
    CPUData* _data;
    ThreadSynchronizer* _sync;
    PipeDispenser* _pd;
    intv* _deviceIDs;
    std::vector<intv*>* _deviceCPUs;
    // Queues of Worker, WorkResult and Message pointers
    Queue<Worker*> _workerQueue;
    Queue<WorkResult*> _resultQueue;
    Queue<Message*> _msgQueue;
    int _numFwdTerminal, _numBwdTerminal;
    int _weightUpdateFreq, _numBwdMiniPasses;
    // For gradient checking
    int _numFailures;
    int _numTests;
    // Training progress (between 0 and 1).
    // Used to determine learning rate based on LearningRateSchedule.
    double _trainingProgress;
    double _baseErr;
    void waitForTerminals(int numMsgs, MESSAGES msg);
    void sendMessage(MESSAGES msg, bool sync);
    void findBwdTerminal(Layer& l, std::set<std::string>& visited, std::set<std::string> &terminal);
    // Thread entry point
    void* run();
 public:
    ConvNet(PyObject* layerParams, intv& deviceIDs, std::vector<intv*>& deviceCPUs, int minibatchSize, int weightUpdateFreq);
    Queue<Message*>& getMessageQueue();
    Queue<Worker*>& getWorkerQueue();
    Queue<WorkResult*>& getResultQueue();
    DataProvider& getDataProvider();
    // Layer lookup by name. std::string is spelled out, as everywhere else in
    // this header, so the declarations do not depend on a using-directive
    // being in effect where the header is included.
    Layer& operator[](std::string& name);
    Layer& getLayer(std::string& name);
    void copyToCPU();
    void copyToGPU();
    void updateWeights();
    void reset();
    void bprop(PASS_TYPE passType);
    void fprop(PASS_TYPE passType);
    void fprop(int miniIdx, PASS_TYPE passType);
    void fprop(CPUData& data, PASS_TYPE passType);
    void setTrainingProgress(double progress);
    double getTrainingProgress() const;
    bool checkGradient(const std::string& name, float eps, Weights& weights);
    void checkGradients();
    Cost& getCost();
    Cost& getCost(Cost& cost);
    double getCostValue();
    int getDeviceID(int gpuIdx);
    intv& getDeviceIDs();
    ThreadSynchronizer& getSync();
    void syncWithChildren();
    int getWeightUpdateFreq();
    int getNumBwdMiniPasses();
    int getMinibatchSize();
    PipeDispenser& getPipeDispenser();
 };
 /*
 * Per-device thread of a ConvNet. It keeps its own layer map, its cost layers,
 * its own message queue and a pointer back to the ConvNet it was created for.
 */
 class ConvNetGPU : public Thread {
 protected:
    std::map<std::string,Layer*> _layerMap;   // Layers keyed by name
    std::vector<CostLayer*> _costs;
    ConvNet* _convNet;
    int _deviceID;
    Queue<Message*> _msgQueue;
    void initCuda();
    virtual void initLayer(PyObject* paramsDict);
    // Thread entry point
    void* run();
    void copyToCPU();
    void copyToGPU();
    void updateWeights();
    void reset();
 public:
    ConvNetGPU(PyObject* layerList, int deviceID, intv& deviceCPUs, ConvNet* convNet);
    std::map<std::string, Layer*>& getLayerMap();
    void bprop(PASS_TYPE passType);
    void fprop(PASS_TYPE passType);
    void fprop(int miniIdx, PASS_TYPE passType);
    int getDeviceID();
    ConvNet& getConvNet();
    void enqueueMessage(Message* msg);
    Queue<Message*>& getMessageQueue();
    std::vector<CostLayer*>& getCostLayers();
    Cost& getCost(int numCases);
    // Layer lookup by name. std::string is spelled out, as everywhere else in
    // this header, so the declarations do not depend on a using-directive
    // being in effect where the header is included.
    Layer& operator[](std::string& name);
    Layer& getLayer(std::string& name);
 };
 #endif	/* CONVNET3 */
--- a/include/cost.cuh
+++ b/include/cost.cuh
@ -0,0 +1,66 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef COST_CUH
 #define	COST_CUH
 #include <vector>
 #include <map>
 #include <helper_cuda.h>
 #include "layer.cuh"
 #include "util.cuh"
 class CostLayer;
 /*
 * Wrapper for dictionary mapping cost name to vector of returned values.
 */
 // Holds a case count, a CostMap and a CostCoeffMap, and exposes them through
 // accessors and arithmetic-style operators.
 class Cost {
 private:
    int _numCases;
    CostMap _costMap;
    CostCoeffMap _costCoeffMap;
 public:
    Cost(int numCases);
    // NOTE(review): presumably fills the maps from the given cost layers --
    // confirm against the definition.
    Cost(int numCases, std::vector<CostLayer*>& costs);
    // Returns a reference to the values stored under cost name s.
    doublev& operator [](const std::string s);
    CostMap& getCostMap();
    CostCoeffMap& getCostCoeffMap();
    int getNumCases();
    /*
     * Returns sum of first values returned by all the costs, weighted by the cost coefficients.
     */
    double getValue();
    // NOTE(review): the semantics of += and |= (e.g. accumulate vs. merge) are
    // not visible from this header -- see the definitions before use.
    Cost& operator += (Cost& er);
    Cost& operator |= (Cost& er);
    Cost& operator /= (const double v);
    virtual ~Cost();
 };
 #endif	/* COST_CUH */
--- a/include/cpuCNN.cuh
+++ b/include/cpuCNN.cuh
@ -0,0 +1,31 @@
 /* 
 * File:   cpuCNN.cuh
 * Author: Alex Krizhevsky
 *
 * Created on September 10, 2012, 5:05 PM
 */
 #ifndef CPUFUNCS_H
 #define	CPUFUNCS_H
 #include <helper_cuda.h>
 #include <softmaxtree.cuh>
 /*
 * Forward routine over a SoftmaxTree, run on the CPU (declaration only).
 *
 * weights:     (numNodes, numFeatures)
 * targets:     (numNodes, numFeatures)
 * numFeatures: second dimension of weights and targets
 * tree:        the SoftmaxTree the routine operates on
 *
 * NOTE(review): this comment used to describe a `nodes` argument (a
 * numNodesAtDepth-length array of ushort2 where the x coordinate gives the node
 * idx and the y coordinate gives the parent idx). The signature has no such
 * parameter; presumably that information now comes from `tree` -- confirm
 * against the definition.
 */
 void cpuSoftmaxTreeFwd(float* weights, float* targets, const int numFeatures, SoftmaxTree& tree);
 /*
 * Backward routine over a SoftmaxTree, run on the CPU (declaration only).
 *
 * grads:       (numNodes, numFeatures)
 * numFeatures: second dimension of grads
 * tree:        the SoftmaxTree the routine operates on
 */
 void cpuSoftmaxTreeBwd(float* grads, const int numFeatures, SoftmaxTree& tree);
 /*
 * Weight-update routine over a SoftmaxTree, run on the CPU (declaration only).
 *
 * NOTE(review): weights, weightsInc and weightsGrad are presumably
 * (numNodes, numFeatures) like the buffers of the Fwd/Bwd routines, and eps,
 * mom and wc presumably are the learning rate, momentum and weight cost --
 * confirm against the definition.
 */
 void cpuSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
                                  const int numFeatures, float eps, const float mom, float wc, SoftmaxTree& tree);
 #endif	/* CPUFUNCS_H */
--- a/include/data.cuh
+++ b/include/data.cuh
@ -0,0 +1,111 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef DATA_CUH
 #define	DATA_CUH
 #include <vector>
 #include <algorithm>
 #include "util.cuh"
 class Data {
 protected:
    MatrixV* _data;
    void assertDimensions() {
        assert(_data->size() > 0);
        for (int i = 1; i < _data->size(); i++) {
            assert(_data->at(i-1)->getNumCols() == _data->at(i)->getNumCols());
            assert(_data->at(i-1)->isTrans() == _data->at(i)->isTrans());
        }
        assert(_data->at(0)->getNumCols() > 0);
    }
 public:
    typedef typename MatrixV::iterator T_iter;
    // Cases in columns, but array may be transposed
    // (so in memory they can really be in rows -- in which case the array is transposed
    //  during the copy to GPU).
    Data(PyObject* pyData) {
        _data = getMatrixV(pyData);
        assertDimensions();
    }
    Data(MatrixV* data) : _data(data) {
        assertDimensions();
    }
    ~Data() {
        for (T_iter it = _data->begin(); it != _data->end(); ++it) {
            delete *it;
        }
        delete _data;
    }
    Matrix& operator [](int idx) const {
        return *_data->at(idx);
    }
    int getSize() const {
        return _data->size();
    }
    MatrixV& getData() const {
        return *_data;
    }
    Matrix& getData(int i) const {
        return *_data->at(i);
    }
    bool isTrans() const {
        return _data->at(0)->isTrans();
    }
    int getNumCases() const {
        return _data->at(0)->getNumCols();
    }
 };
 typedef Data CPUData;
 // Hands out pieces of a CPUData object, either by minibatch index or by an
 // explicit case range.
 class DataProvider {
 protected:
    CPUData* _hData;     // Data set last passed to setData (presumably -- confirm)
    NVMatrixV _data;
    int _minibatchSize;
 public:
    DataProvider(int minibatchSize);
    void setData(CPUData&);
    void clearData();
    CPUData& getMinibatch(int idx);
    // NOTE(review): whether endCase is inclusive or exclusive is not visible
    // from this header -- check the definition.
    CPUData& getDataSlice(int startCase, int endCase);
    int getNumMinibatches();
    int getMinibatchSize();
    int getNumCases();
    int getNumCasesInMinibatch(int idx);
 };
 #endif	/* DATA_CUH */
--- a/include/hostmem.cuh
+++ b/include/hostmem.cuh
@ -0,0 +1,51 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef HOSTMEM_CUH
 #define	HOSTMEM_CUH
 #include <helper_cuda.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 /*
 * A utility class for transferring untyped memory from CPU to GPU and vice versa.
 */
 // Untyped buffer described by a byte count and a data pointer.
 // NOTE(review): the name suggests page-locked (pinned) host memory, but the
 // allocation itself is not visible in this header -- confirm in the definition.
 class PinnedHostMem {
 protected:
    uint _numBytes;   // Size of the buffer in bytes
    void* _data;      // Start of the buffer
 public:
    PinnedHostMem();
    ~PinnedHostMem();
    void resize(uint bytes);
    // Copies `bytes` bytes from src; presumably resizes first -- TODO confirm.
    void copyFrom(void* src, uint bytes);
    // Copies the buffer to dst; the caller supplies no size, so dst presumably
    // must hold at least _numBytes bytes -- TODO confirm.
    void copyTo(void* dst);
    void* getData();
 };
 #endif	/* HOSTMEM_CUH */
--- a/include/layer.cuh
+++ b/include/layer.cuh
@ -0,0 +1,654 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef LAYER_CUH
 #define	LAYER_CUH
 #include <algorithm>
 #include <string>
 #include <vector>
 #include <map>
 #include <assert.h>
 #include <nvmatrix.cuh>
 #include <multisoftmax.h>
 #include <helper_timer.h>
 #include "convnet.cuh"
 #include "cost.cuh"
 #include "weights.cuh"
 #include "neuron.cuh"
 #include "data.cuh"
 #include "layer_kernels.cuh"
 #include "hostmem.cuh"
 #include "softmaxtree.cuh"
 #include "pipedispenser.cuh"
 class Cost;
 class ConvNet;
 class ConvNetGPU;
 class CostLayer;
 class DataLayer;
 //class Message;
 //class FpropMessage;
 // The input matrix here is the squared norm.
 // This replaces the squared norm with:
 // 1 if it is below the threshold given by norm2
 // norm/sqrt(a) otherwise -- i.e. the desired norm (not squared)
 class WeightConstraintOperator {
 private:
    float _norm;   // Maximum allowed norm
    float _norm2;  // _norm squared, precomputed by the constructor
 public:
    WeightConstraintOperator(float norm) : _norm(norm), _norm2(norm*norm) {
    }
    // Maps a squared norm `a` to a multiplicative factor:
    // _norm / sqrt(a) when `a` exceeds _norm2, and 1 otherwise.
    __device__ inline float operator()(const float a) const {
        if (a > _norm2) {
            return __fdividef(_norm, sqrtf(a));
        }
        return 1.0f;
    }
 };
 class WeightContrastNormOperator {
 private:
    float _min;
    float _max;
    float _scale;
 public:
    WeightContrastNormOperator(float min, float max, float scale) : _min(min), _max(max), _scale(scale) {
    }
    // Takes `a`, replaces it with sqrt(a) * _scale, and returns the factor
    // that would bring that value to _min (if below it) or to _max (if above
    // it); returns 1 when the value already lies within [_min, _max].
    __device__ inline float operator()(float a) const {
        a = sqrtf(a) * _scale;
        if (a < _min) {
            return __fdividef(_min, a);
        }
        if (a > _max) {
            return __fdividef(_max, a);
        }
        return 1.0f;
    }
 };
 /*
 * Abstract layer.
 */
 class Layer {
 protected:
    ConvNetGPU* _convNetGPU;
    // Neighbouring layers; see addPrev()/addNext() and getPrev()/getNext().
    std::vector<Layer*> _prev, _next;
    // Counters for received forward/backward inputs; see getRcvdFInputs(),
    // incRcvdBInputs() and incRcvdBInputMsgs(). _rcvdBInputs is keyed by int
    // (presumably a device ID, matching incRcvdBInputs(int deviceID)).
    int _rcvdFInputs;
    std::map<int, int> _rcvdBInputs;
    int _rcvdBInputMsgs;
    int _numOutputs;
    NVMatrixV _inputs;
    // Keyed by int (presumably a device ID, matching getActs(int deviceID)).
    std::map<int, NVMatrix*> _outputs;
    std::map<int, NVMatrix*> _actsGrad; // Layer activity gradients
    bool _gradConsumer, _foundGradConsumers, _trans;
    bool _conserveMem;
    bool _bwdTerminal;
    int _numGradProducersNext;
    int _actsTarget, _actsGradTarget;
    std::string _name, _type;
    int _deviceID;
    intv _nextDeviceIDs;
    HostNVMatrix _hostMemFwd, _hostMemBwd;
    Quantizer* _fwdQuantizer, *_bwdQuantizer;
    virtual void fpropNext(PASS_TYPE passType);
    virtual void truncBwdActs(); 
    // Pure virtual: every concrete layer must provide its forward computation.
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType) = 0;
    virtual void bpropCommon(NVMatrix& v, PASS_TYPE passType) {
        // Do nothing by default
    }
    virtual void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType) {
        assert(!isGradProducer()); // Only do nothing if not grad producer
    }
    void shuffle(intv& v);
 public:
    static bool _saveActsGrad, _saveActs;
    Layer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool trans);
    // Virtual so that deleting a derived layer through a Layer* runs the
    // derived destructor. Layer is a polymorphic base, and several subclasses
    // declared in this header hold raw-pointer members.
    virtual ~Layer() {
    }
    virtual void fprop(PASS_TYPE passType);
    void fprop(NVMatrix& v, PASS_TYPE passType);
    virtual void fprop(NVMatrixV& v, PASS_TYPE passType);
    virtual void bprop(PASS_TYPE passType);
    virtual void bprop(NVMatrix& v, PASS_TYPE passType);
    virtual void reset();
    int getNumCases(NVMatrix& v);
    int incRcvdBInputs(int deviceID);
    int getRcvdFInputs();
    int getRcvdBInputs(int deviceID);
    int incRcvdBInputMsgs();
    bool isGradConsumer();
    bool hasGradProducerNext(std::string& layerName);
    // Does this layer produce a gradient for any layer?
    virtual bool isGradProducer();
    // Does this layer produce a gradient for layer of given name?
    virtual bool isGradProducer(std::string& layerName);
    std::string& getName();
    std::string& getType();
    void addNext(Layer* l);
    void addPrev(Layer* l);
    std::vector<Layer*>& getPrev();
    std::vector<Layer*>& getNext();
    virtual NVMatrix& getActs();
    virtual NVMatrix& getActs(int deviceID);
    virtual NVMatrix& getActsGrad(int deviceID);
    virtual NVMatrix& getActsGrad();
    virtual void postInit();
    int getDeviceID();
    ConvNetGPU& getConvNetGPU();
    ConvNet& getConvNet();
    PipeDispenser& getPipeDispenser();
    void setBwdTerminal(bool t);
    // Do nothing if this layer has no weights
    virtual bool updateWeights() {
        return false;
    }
    // The three hooks below are no-ops by default; layers that need them
    // (e.g. WeightLayer in this header) override them.
    virtual void checkGradients() {
    }
    virtual void copyToCPU() {
    }
    virtual void copyToGPU()  {
    }
 };
 // Layer holding a Neuron object and the name of its neuron type.
 // Overrides fpropActs/bpropActs; the bodies are not in this header.
 class NeuronLayer : public Layer {
 protected:
    Neuron* _neuron;
    // Qualified with std:: like every other string in this header, so the
    // declaration does not depend on a using-directive leaking from an include.
    std::string _neuronType;
    virtual void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    virtual void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    NeuronLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    std::string& getNeuronType();
 };
 // Abstract Layer subclass for layers that carry a WeightList and a bias
 // Weights object. Subclasses must implement bpropBiases, bpropWeights and
 // constrainWeights (all pure virtual here).
 class WeightLayer : public Layer {
 protected:
    WeightList _weights;
    Weights *_biases;
    float _wStep, _bStep;
    bool _gradComputed;
    // Overrides Layer::bpropCommon, which does nothing by default.
    void bpropCommon(NVMatrix& v, PASS_TYPE passType);
    virtual void bpropBiases(NVMatrix& v, PASS_TYPE passType) = 0;
    virtual void bpropWeights(NVMatrix& v, int inpIdx, PASS_TYPE passType) = 0;
    virtual void constrainWeights() = 0;
 public:
    WeightLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool trans, bool useGrad, bool initWeights);
    // The four overrides below replace Layer's default no-op versions.
    virtual bool updateWeights();
    virtual void copyToCPU();
    virtual void copyToGPU();
    virtual void checkGradients();
    // Returns the Weights at position idx (presumably an index into _weights -- confirm).
    Weights& getWeights(int idx);
 };
 // Concrete WeightLayer (presumably "fully connected", judging by the name).
 // Implements the forward/backward hooks and the three pure virtuals
 // inherited from WeightLayer.
 class FCLayer : public WeightLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int inpIdx, PASS_TYPE passType);
    virtual void constrainWeights();
 public:
    FCLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool useGrad, bool initWeights);
    // NOTE(review): neither WeightLayer nor Layer declares a default
    // constructor in this header -- confirm this overload is defined and used.
    FCLayer();
 };
 // FCLayer variant that additionally holds a TreeWeights pointer and can
 // populate a SoftmaxNode tree from a Python object (makeTree).
 class TreeFCLayer : public FCLayer {
 protected:
    TreeWeights* _treeWeights;
    // Fills rootNode from pyTree; the expected structure of pyTree is not visible here.
    static void makeTree(PyObject* pyTree, SoftmaxNode& rootNode);
    void constrainWeights();
 public:
    TreeFCLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    // Note: fpropActs/bpropActs are public here, whereas FCLayer declares them protected.
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void checkGradients();
 };
 // Weight-less layer (derives directly from Layer) with a switch,
 // _doLogregGrad, that is set through setDoLogregGrad().
 // NOTE(review): presumably selects the combined logreg+softmax gradient
 // path declared in layer_kernels.cuh -- confirm in the implementation.
 class SoftmaxLayer : public Layer {
 protected:
    bool _doLogregGrad;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    SoftmaxLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    void setDoLogregGrad(bool b);
 };
 // Weight-less layer holding a list of integer copy offsets
 // (presumably one per input, giving where each input lands in the output -- confirm).
 class ConcatenationLayer : public Layer {
 protected:
    intv* _copyOffsets;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    ConcatenationLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    // NOTE(review): this class has no logreg-related member, and the same
    // declaration appears on SoftmaxLayer; it looks like a copy-paste leftover.
    // Confirm whether it is defined or called anywhere before removing it.
    void setDoLogregGrad(bool b);
 };
 // Weight-less layer holding a list of float coefficients
 // (presumably one per input, used to weight an element-wise sum -- confirm).
 class EltwiseSumLayer : public Layer {
 protected:
    floatv* _coeffs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    EltwiseSumLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less, stateless layer (no data members of its own); presumably an
 // element-wise maximum over its inputs, judging by the name.
 class EltwiseMaxLayer : public Layer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    EltwiseMaxLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Layer that is fed from CPUData objects rather than from other layers.
 // Besides Layer's _outputs it keeps a second set of matrices (_outputs2)
 // and a pointer/index pair describing buffered data.
 class DataLayer : public Layer {
 protected:
    bool _useBuffer;
    int _dataIdx;
    // Minibatch index recorded for the buffer; see setBuffer()/getBufferMinibatchIdx().
    int _bufferMinibatchIdx;
    std::map<int, NVMatrix*> _outputs2; // Buffer for copying data during computation
    CPUData* _bufferData;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void postInit();
    // `other` presumably selects between _outputs and _outputs2 -- confirm.
    void copyData(CPUData& data, bool other);
    void fpropNext(PASS_TYPE passType);
 public:
    DataLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    NVMatrix& getActs(int deviceID);
    // Two-argument overload; `other` presumably selects the secondary buffer -- confirm.
    NVMatrix& getActs(int deviceID, bool other);
    bool isGradProducer();
    void fprop(PASS_TYPE passType);
    void fprop(NVMatrixV& data, PASS_TYPE passType);
    void setBuffer(CPUData& data, int minibatchIdx);
    void startFprop(CPUData& data, PASS_TYPE passType);
    void startFpropFromBuffer(PASS_TYPE passType);
    int getBufferMinibatchIdx();
    CPUData* getBufferData();
 };
 // Common base for layers with locally-connected filters (ConvLayer and
 // LocalUnsharedLayer derive from it in this header). It does not implement
 // WeightLayer's pure virtuals, so it remains abstract.
 class LocalLayer : public WeightLayer {
 protected:
    // A pair of int arrays describing filter connectivity; the h/d prefixes
    // presumably stand for host and device copies -- confirm.
    struct FilterConns {
        int* hFilterConns;
        int* dFilterConns;
    };
    // Qualified with std:: like the other containers in this header, so the
    // declaration does not depend on a using-directive leaking from an include.
    std::vector<FilterConns>* _filterConns;
    // Per-entry geometry lists (presumably one entry per input -- confirm).
    intv* _padding, *_stride, *_filterSize, *_channels, *_imgSize, *_groups;
    intv* _imgPixels, *_filterPixels, *_filterChannels, *_overSample, *_randSparse;
    int _modulesX, _modules, _numFilters;
    void copyToGPU();
 public:
    LocalLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool useGrad);
 };
 // Concrete LocalLayer (presumably convolutional, i.e. filters shared across
 // positions, judging by the name and by the sibling LocalUnsharedLayer).
 class ConvLayer : public LocalLayer {
 protected:
    int _partialSum;
    bool _sharedBiases;
    floatv* _weightContrastNormMin, *_weightContrastNormMax;
    // Scratch matrices (named as temporaries for weight and activity gradients).
    NVMatrix _weightGradTmp, _actGradTmp;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int inpIdx, PASS_TYPE passType);
    void truncBwdActs();
    void constrainWeights();
 public:
    ConvLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // Concrete LocalLayer (presumably locally connected with unshared filters,
 // judging by the name). Holds one extra matrix, _sexMask.
 class LocalUnsharedLayer : public LocalLayer {
 protected:
    // NOTE(review): purpose not visible here; HiddenSexLayer has a member of the same name.
    NVMatrix _sexMask;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropBiases(NVMatrix& v, PASS_TYPE passType);
    void bpropWeights(NVMatrix& v, int inpIdx, PASS_TYPE passType);
    void constrainWeights();
 public:
    LocalUnsharedLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // Common base for the pooling layers declared below (AvgPoolLayer,
 // MaxPoolLayer, RandomPoolLayer). It does not implement fpropActs, so it
 // remains abstract; makePoolLayer() returns a reference to a concrete one.
 class PoolLayer : public Layer {
 protected:
    int _channels, _sizeX, _start, _stride, _outputsX;
    int _imgSize;
    // Qualified with std:: like the other strings in this header, so the
    // declaration does not depend on a using-directive leaking from an include.
    std::string _pool;
 public:
    PoolLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool trans);
    // Factory; presumably picks the subclass from paramsDict -- confirm.
    static PoolLayer& makePoolLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // Concrete PoolLayer with no extra state (presumably average pooling, judging by the name).
 class AvgPoolLayer : public PoolLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    AvgPoolLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // Concrete PoolLayer configured with a boolean `abs` flag at construction
 // (presumably max pooling, optionally on absolute values -- confirm).
 class MaxPoolLayer : public PoolLayer {
 protected:
    bool _abs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    MaxPoolLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool abs);
 };
 // Concrete PoolLayer with a single boolean option, _doMax.
 // NOTE(review): the pooling rule itself is not visible from this header.
 class RandomPoolLayer : public PoolLayer {
 protected:
    bool _doMax;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    RandomPoolLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer holding image/target sizes, a maximum scale, a scratch
 // matrix for rescaled activities and a vector of scale probabilities.
 // Unlike most layers in this header, fpropActs/bpropActs are public here.
 class RandomScaleLayer : public Layer {
 protected:
    int _channels, _imgSize, _tgtSize, _minScaledSize;
    float _maxScale; // should be >= 1
    NVMatrix _rescaledActs;
    std::vector<double> _scaleProbs;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    RandomScaleLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer parameterized by a start offset and a stride
 // (presumably strided subsampling of the image -- confirm).
 // fpropActs/bpropActs are declared public here.
 class NailbedLayer : public Layer {
 protected:
    int _channels, _start, _stride, _outputsX;
    int _imgSize;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    NailbedLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer that keeps a filter both as a Matrix pointer (_hFilter)
 // and as an NVMatrix (_filter), and overrides copyToGPU()
 // (presumably to upload _hFilter into _filter -- confirm).
 class GaussianBlurLayer : public Layer {
 protected:
    int _channels;
    Matrix* _hFilter;
    NVMatrix _filter;
    NVMatrix _actGradsTmp;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void copyToGPU();
    GaussianBlurLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer parameterized by channel count and image size
 // (presumably mirrors images left-to-right, judging by the name).
 class HorizontalReflectionLayer : public Layer {
 protected:
    int _channels, _imgSize;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    // Note: the first parameter is named convNet here but has the same
    // ConvNetGPU* type as in every other layer constructor.
    HorizontalReflectionLayer(ConvNetGPU* convNet, PyObject* paramsDict);
 };
 // Weight-less layer parameterized by a scale factor and source/target image sizes.
 class ResizeLayer : public Layer {
 protected:
    int _channels;
    float _scale;
    int _imgSize, _tgtSize;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    ResizeLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer with an enable flag, a float _keep and a mask matrix.
 // NOTE(review): the fields look like a unit-masking scheme with keep
 // probability _keep, but that is not established by this header -- confirm.
 class HiddenSexLayer : public Layer {
 protected:
    bool _enable;
    float _keep;
    NVMatrix _sexMask;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
    HiddenSexLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less, stateless layer (presumably an RGB-to-YUV colour-space
 // conversion, judging by the name).
 class RGBToYUVLayer : public Layer {
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    RGBToYUVLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less layer with a single boolean option, _center (presumably an
 // RGB-to-LAB colour-space conversion with optional centering -- confirm).
 class RGBToLABLayer : public Layer {
 protected:
    bool _center;
 public:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    RGBToLABLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Weight-less normalization layer parameterized by a window size, a scale
 // and a power; keeps a _denoms matrix between the forward and backward
 // passes (presumably the normalization denominators -- confirm).
 // Base class of CrossMapResponseNormLayer and ContrastNormLayer.
 class ResponseNormLayer : public Layer {
 protected:
    int _channels, _size;
    float _scale, _pow;
    NVMatrix _denoms;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
 public:
    ResponseNormLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // ResponseNormLayer variant with two extra options, _blocked and _minDiv
 // (presumably normalizes across feature maps, judging by the name).
 class CrossMapResponseNormLayer : public ResponseNormLayer {
 protected:
    bool _blocked;
    float _minDiv;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    CrossMapResponseNormLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 }; 
 // ResponseNormLayer variant that additionally stores an image size and a
 // _meanDiffs matrix (presumably activities minus a local mean -- confirm).
 class ContrastNormLayer : public ResponseNormLayer {
 protected:
    int _imgSize;
    NVMatrix _meanDiffs;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
    void truncBwdActs();
 public:
    ContrastNormLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Common base for the cost (objective) layers declared below. Holds a
 // scalar coefficient and a vector of cost values exposed through getCost().
 // It does not implement fpropActs, so it remains abstract; makeCostLayer()
 // returns a reference to a concrete subclass.
 class CostLayer : public Layer {
 protected:
    float _coeff;
    doublev _costv;
 public:
    CostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict, bool trans);
    // NOTE(review): declaring bprop/fprop overloads here hides the other
    // Layer::bprop/fprop overloads for calls made through a CostLayer; that
    // is presumably what the commented-out line below was working around.
    void bprop(NVMatrix& v, PASS_TYPE passType);
 //    void bprop(PASS_TYPE passType); // Pure idiocy... it won't compile without this useless definition.
    void fprop(PASS_TYPE passType); 
    virtual doublev& getCost();
    float getCoeff();
    bool isGradProducer();
    void setSendTerminalMessages(bool send);
    // Factory taking the cost type name. The parameter is qualified with
    // std:: like the other strings in this header, so the declaration does
    // not depend on a using-directive leaking from an include.
    static CostLayer& makeCostLayer(ConvNetGPU* convNetGPU, std::string& type, PyObject* paramsDict);
 };
 /*
 * Input 0: labels
 * Input 1: softmax outputs
 */
 // Concrete CostLayer with no extra state; presumably backed by the
 // computeCrossEnt* routines declared in layer_kernels.cuh -- confirm.
 class CrossEntCostLayer : public CostLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    CrossEntCostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 /*
 * Input 0: labels
 * Input 1: softmax outputs
 */
 // Concrete CostLayer that additionally keeps per-case probability matrices,
 // a running accumulator exposed through getProbsAccum(), and a top-k setting.
 class LogregCostLayer : public CostLayer {
 protected:
    NVMatrix _correctProbs, _topkProbs;
    // Accumulated probabilities and the number of accumulations so far
    // (presumably; the update logic is not in this header).
    NVMatrix _probsAccum;
    int _numAccumed;
    int _topk;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    LogregCostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    NVMatrix& getProbsAccum();
 };
 /*
 * Input 0: labels
 * Input 1: logistic outputs
 */
 // Concrete CostLayer built around the two-class cross-entropy
 //     f(t, y) = t * log(y) + (1 - t) * log(1 - y)
 // The nested functors evaluate f and its derivative with respect to y
 // element-wise. Judging by the input-order comment for this layer, t is the
 // label and y the unit output -- confirm at the call sites.
 class CrossEnt2CostLayer : public CostLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    CrossEnt2CostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    // Evaluates f(t, y). safelog substitutes a finite value for
    // non-positive arguments, so y == 0 or y == 1 does not produce -inf.
    class CrossEntOperator {
    public:
        __device__ inline float operator()(const float t, const float y) const {
            return t * safelog(y) + (1.0f - t) * safelog(1.0f - y);
        }
    };
    // Only for use with non-logistic units
    class CrossEntGradientOperator {
    private:
        float _coeff;
    public:
        CrossEntGradientOperator(float coeff) : _coeff(coeff) {
        }
        // Returns _coeff * df/dy, where
        //     df/dy = t / y - (1 - t) / (1 - y).
        // The second term is subtracted: d/dy log(1 - y) = -1 / (1 - y).
        // The previous version added it, which does not match the function
        // computed by CrossEntOperator.
        __device__ inline float operator()(const float t, const float y) const {
            return _coeff * (__fdividef(t, y) - __fdividef(1.0f - t, 1.0f - y));
        }
    };
 };
 /*
 * Input 0: labels
 * Input 1: logistic outputs
 */
 // Concrete CostLayer built around the element-wise cost log(1 + (y - t)^2).
 // The nested functors evaluate that cost and a scaled, negated derivative.
 class RobustFlickrCost : public CostLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    RobustFlickrCost(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    // Returns log(1 + (y - t)^2).
    class RobustFlickrCostOperator {
    public:
        __device__ inline float operator()(const float t, const float y) const {
            const float sqDiff = (y - t) * (y - t);
            return __logf(1 + sqDiff);
        }
    };
    // Only for use with non-logistic units
    class RobustFlickrCostGradientOperator {
    private:
        float _coeff;
    public:
        RobustFlickrCostGradientOperator(float coeff) : _coeff(coeff) {
        }
        // Returns -_coeff * 2(y - t) / (1 + (y - t)^2), i.e. the negated,
        // scaled derivative of the cost above with respect to y.
        __device__ inline float operator()(const float t, const float y) const {
            const float diff = y - t;
            const float slope = __fdividef(2.0f * diff, 1.0f + diff*diff);
            return -_coeff * (slope);
        }
    };
 };
 // Concrete CostLayer with no extra state (presumably a sum-of-squares
 // objective over its input, judging by the name).
 class SumOfSquaresCostLayer : public CostLayer {
 protected:
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    SumOfSquaresCostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 /*
 * Input 0: labels
 * Input 1: energies
 */
 // Concrete CostLayer that keeps both GPU (NVMatrix) and CPU (Matrix) side
 // state, plus a vector of Matrix pointers B and a thread count; presumably
 // pairs with the MultiSoftmaxCPU* routines declared in layer_kernels.cuh
 // and multisoftmax.h -- confirm.
 class MultiSoftmaxCostLayer : public CostLayer {
 protected:
    NVMatrix _probsT;
    Matrix _cpuProbs, _cpuLabels, _energies_T_CPU;
    // Note: unlike the other members, this one has no leading underscore.
    std::vector<Matrix*> B;
    int _setSize, _numOut, _threads;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    MultiSoftmaxCostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
    void computeCost(bool useEnergies);
 };
 /*
 * input 0: gates
 * input 1: what to sum and square
 */
 // Concrete CostLayer with one scratch matrix, _ungated (presumably the
 // sum-of-squares values before the gates are applied -- confirm).
 class GatedSumOfSquaresCostLayer : public CostLayer {
 protected:
    NVMatrix _ungated;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    GatedSumOfSquaresCostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 // Concrete CostLayer parameterized by a window size (_sizeX) and a channel count.
 // NOTE(review): the cost formula is not visible from this header.
 class TICACostLayer : public CostLayer {
 protected:
    int _sizeX, _channels;
    void fpropActs(int inpIdx, float scaleTargets, PASS_TYPE passType);
    void bpropActs(NVMatrix& v, int inpIdx, float scaleTargets, PASS_TYPE passType);
 public:
    TICACostLayer(ConvNetGPU* convNetGPU, PyObject* paramsDict);
 };
 #endif	/* LAYER_CUH */
--- a/include/layer_kernels.cuh
+++ b/include/layer_kernels.cuh
@ -0,0 +1,65 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef LAYER_KERNELS_CUH
 #define	LAYER_KERNELS_CUH
 #include <vector>
 #include <helper_cuda.h>
 #include <nvmatrix.cuh>
 // Thread-count constants in X and Y for the logreg gradient and error
 // computations (presumably CUDA thread-block dimensions -- the kernels that
 // use them are not in this header).
 #define LOGREG_GRAD_THREADS_X      32
 #define LOGREG_GRAD_THREADS_Y      4
 #define LOGREG_ERR_THREADS_X        128
 #define LOGREG_ERR_THREADS_Y        1
 // Logarithm that never returns -inf or NaN for non-positive input:
 // yields __logf(x) for x > 0 and the constant -50 for anything else.
 __device__ inline float safelog(const float x) {
    if (x > 0.0f) {
        return __logf(x);
    }
    return -50.0f;
 }
 // Host-callable entry points; the definitions are not in this header.
 // Naming convention visible in the signatures: parameters ending in _out
 // are outputs, `target` receives the result, and `add` presumably selects
 // accumulate-into-target versus overwrite -- confirm in the implementation.
 void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
 void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
 void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, bool add);
 void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out);
 void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
 // Numerical stability optimization: this routine combines computeLogregGrad with computeSoftmaxGrad
 // to avoid dividing and then multiplying by quantities that may be near zero.
 void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
 void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff);
 void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add);
 // Multi-softmax helpers: MSMBackward takes GPU matrices; the two
 // MultiSoftmaxCPU* variants take CPU-side Matrix arguments.
 void MSMBackward(NVMatrix& energies, NVMatrix& bLattice, int setSize);
 void MultiSoftmaxCPU(Matrix& elts, Matrix& B, Matrix& probs, int size, int fixed);
 void MultiSoftmaxCPU_T(Matrix& elts, Matrix& B, Matrix& probs, Matrix& fixed, int size);
 void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& energies, NVMatrix& labelLogProbs_out,
                       NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize, bool useEnergies);
 #endif	/* LAYER_KERNELS_CUH */
--- a/include/lr.cuh
+++ b/include/lr.cuh
@ -0,0 +1,77 @@
 #ifndef LR_CUH
 #define	LR_CUH
 #include <string>
 #include <vector>
 #include <iostream>
 #include <helper_cuda.h>
 #include <assert.h>
 #include <nvmatrix.cuh>
 #include <matrix.h>
 #include <util.cuh>
 #include <Python.h>
 /*
 * The maximum learning rate is _baseRate.
 * The minimum learning rate is _baseRate / _tgtFactor.
 *
 * These classes define annealing schedules that interpolate between these
 * two extrema.
 */
 // Base class of the learning-rate schedules. Subclasses customize the
 // schedule by overriding the protected virtual _getRate(progress); callers
 // use the public, non-virtual getRate(progress).
 class LearningRateSchedule {
 protected:
 	// _randnSpare/_haveRandnSpare presumably cache a spare normal sample
 	// between calls to randn() -- confirm in the implementation.
 	double _baseRate, _noiseStdev, _randnSpare;
 	bool _haveRandnSpare;
 	virtual double _getRate(double progress);
 	double randn();
 	// NOTE(review): rand() and abs() share their names with the C library
 	// functions; inside member functions unqualified calls resolve to these.
 	double rand() const;
 	double abs(double x) const;
 public:
 	LearningRateSchedule(double base);
 	LearningRateSchedule(double base, double noiseStdev);
 	// `progress` is presumably the fraction of training completed, in [0, 1] -- confirm.
 	double getRate(double progress);
 	double getBaseRate() const;
 	virtual ~LearningRateSchedule();
 	// Factory: builds a schedule from a Python dict and a base rate. The
 	// expected keys of lrsDict are not visible from this header.
 	static LearningRateSchedule& make(PyObject* lrsDict, double base);
 };
 // Schedule that stores a final rate (presumably interpolating linearly from
 // the base rate to _finalRate, judging by the name -- confirm).
 class LinearLRS : public LearningRateSchedule {
 protected:
 	double _finalRate;
 public:
 	LinearLRS(double base, double tgtFactor, double noiseStdev);
 	virtual double _getRate(double progress);
 };
 // Schedule that stores a single exponent-like parameter, _pow (presumably
 // an exponential decay from the base rate, judging by the name -- confirm).
 class ExpLRS : public LearningRateSchedule {
 protected:
 	double _pow;
 public:
 	ExpLRS(double baseRate, double tgtFactor, double noiseStdev);
 	virtual double _getRate(double progress);
 };
 // Schedule that stores two shape parameters, _alpha and _beta (presumably
 // a tanh-shaped decay, judging by the name -- confirm).
 class TanhLRS : public LearningRateSchedule {
 protected:
 	double _alpha, _beta;
 public:
 	TanhLRS(double baseRate, double tgtFactor, double noiseStdev);
 	virtual double _getRate(double progress);
 };
 // Schedule that stores a precomputed list of rates; the constructor takes
 // the number of steps (presumably the length of _rates -- confirm).
 class DiscreteExpLRS : public LearningRateSchedule {
 protected:
 	std::vector<double> _rates;
 public:
 	DiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps);
 	virtual double _getRate(double progress);
 };
 // DiscreteExpLRS variant with the same constructor arguments and no extra
 // state; only _getRate() is overridden.
 class JumpyDiscreteExpLRS : public DiscreteExpLRS {
 public:
 	JumpyDiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps);
 	virtual double _getRate(double progress);
 };
 #endif	/* LR_CUH */
--- a/include/messages.cuh
+++ b/include/messages.cuh
@ -0,0 +1,133 @@
 /*
 * messages.cuh
 *
 *  Created on: 2013-02-25
 *      Author: spoon
 */
 #ifndef MESSAGES_CUH_
 #define MESSAGES_CUH_
 #include <string>
 // Tags identifying the kind of a Message; stored in Message::_messageType.
 enum MESSAGES { FPROP_TERMINAL,
                BPROP_TERMINAL,
                BPROP_READY,     // tag used by BpropMessage
                FPROP_READY,     // tag used by FpropMessage
                SYNC,
                COPY_TO_CPU,
                COPY_TO_GPU,
                UPDATE_WEIGHTS,
                RESET,
                COST_COMPUTED,
                BPROP_START,     // tag used by BpropStartMessage
 //                COPY,
 //                DEQUANTIZE,
                RUNME};          // tag used by RunMeMessage and its subclasses
 // Base class of all messages: carries only the MESSAGES tag given at
 // construction. The destructor is virtual so subclasses can be deleted
 // through a Message*.
 class Message {
 protected:
    MESSAGES _messageType;
 public:
    // const-qualified so the tag can also be read through a const Message.
    MESSAGES getMessageType() const {
        return _messageType;
    }
    Message(MESSAGES messageType) : _messageType(messageType) {
    }
    virtual ~Message() {
    }
 };
 /*
 * A message that performs some simple function in its run method.
 */
 class RunMeMessage : public Message {
 public:
    // Always tagged RUNME.
    RunMeMessage() : Message(RUNME) {
    }
    // The action to perform; pure virtual, supplied by subclasses.
    virtual void run() = 0;
    virtual ~RunMeMessage() {
    }
 };
 // RunMeMessage whose run() calls _src->copy(*_tgt).
 // The message deletes _src when destroyed (after asserting that it is a
 // view); _tgt is never deleted here.
 class CopyMessage : public RunMeMessage {
 protected:
    NVMatrix* _src, *_tgt;
 public:
    // The base class is listed first because it is always initialized before
    // the members, whatever order the initializer list is written in.
    CopyMessage(NVMatrix* src, NVMatrix* tgt) : RunMeMessage(), _src(src), _tgt(tgt) {
    }
    void run() {
        _src->copy(*_tgt);
    }
    ~CopyMessage() {
        // NOTE(review): this header only includes <string>; assert is
        // expected to be provided by whichever file includes this one.
        assert(_src->isView());
        delete _src;
    }
 };
 // RunMeMessage whose run() calls _q->dequantize(*_tgt).
 // Neither pointer is deleted by this message.
 class DequantizeMessage : public RunMeMessage {
 protected:
    Quantizer* _q;
    NVMatrix *_tgt;
 public:
    // The base class is listed first because it is always initialized before
    // the members, whatever order the initializer list is written in.
    DequantizeMessage(Quantizer* q, NVMatrix* tgt) : RunMeMessage(), _q(q), _tgt(tgt)  {
    }
    void run() {
        _q->dequantize(*_tgt);
    }
    ~DequantizeMessage() {
    }
 };
 // Message naming a source layer, a destination layer and a pass type.
 // The MESSAGES tag is supplied by the subclass (see FpropMessage and
 // BpropMessage).
 class PropMessage : public Message {
 protected:
    std::string _fromLayer, _toLayer;
    PASS_TYPE _passType;
 public:
    // The getters return references to the stored strings, not copies.
    std::string& getFromLayer() {
        return _fromLayer;
    }
    std::string& getToLayer() {
        return _toLayer;
    }
    PASS_TYPE getPassType() {
        return _passType;
    }
    // The base class is listed first because it is always initialized before
    // the members, whatever order the initializer list is written in.
    PropMessage(std::string fromLayer, std::string toLayer, PASS_TYPE passType, MESSAGES msgType)
        : Message(msgType), _fromLayer(fromLayer), _toLayer(toLayer), _passType(passType) {
    }
 };
 // PropMessage tagged FPROP_READY.
 class FpropMessage : public PropMessage {
 public:
    FpropMessage(std::string fromLayer, std::string toLayer, PASS_TYPE passType)
        : PropMessage(fromLayer, toLayer, passType, FPROP_READY) {
    }
 };
 // PropMessage tagged BPROP_READY.
 class BpropMessage : public PropMessage {
 public:
    BpropMessage(std::string fromLayer, std::string toLayer, PASS_TYPE passType)
        : PropMessage(fromLayer, toLayer, passType, BPROP_READY) {
    }
 };
 // Message tagged BPROP_START that carries only a pass type.
 class BpropStartMessage : public Message {
 protected:
    PASS_TYPE _passType;
 public:
    PASS_TYPE getPassType() {
        return _passType;
    }
    // The base class is listed first because it is always initialized before
    // the members, whatever order the initializer list is written in.
    BpropStartMessage(PASS_TYPE passType)
        : Message(BPROP_START), _passType(passType) {
    }
 };
 #endif /* MESSAGES_CUH_ */
--- a/include/multisoftmax.h
+++ b/include/multisoftmax.h
@ -0,0 +1,38 @@
 /* 
 * File:   multisoftmax.h
 * Author: Alex Krizhevsky
 *
 * Created on May 9, 2012, 5:36 PM
 */
 #ifndef MULTISOFTMAX_H
 #define	MULTISOFTMAX_H
 #include <algorithm>
 #include <thread.h>
 #include <matrix.h>
 #include <vector>
 // DIVUP(x, y): x / y rounded up, for positive integer operands. Guarded so
 // that an earlier definition from another header is kept.
 #ifndef DIVUP
 #define DIVUP(x, y) (((x) + (y) - 1) / (y))
 #endif
 // Aliases for the math functions used by the multi-softmax code.
 #define EXP exp
 #define LOG log
 // A large finite float constant (1e35), not an actual IEEE infinity.
 #define INF 1e35f
 // Thread subclass that stores pointers to the matrices it works on plus a
 // size and a flag; the work itself happens in the protected run() override,
 // whose body is not in this header.
 class MultiSoftmaxWorker : public Thread {
 protected:
    Matrix* _elts, *_B, *_probs, *_fixed;
    int _size;
    bool _nofix;
    void* run();
 public:
    // NOTE(review): the fourth parameter is spelled `_fixed`, the same as the
    // member it presumably initializes; the other parameters carry no
    // underscore. Harmless in a declaration, but easy to misread.
    MultiSoftmaxWorker(Matrix* elts, Matrix* B, Matrix* probs, Matrix* _fixed, int size, bool nofix);
    virtual ~MultiSoftmaxWorker();
 };
 // Variant of MultiSoftmaxCPU_T that takes a vector of B matrices
 // (presumably one per MultiSoftmaxWorker thread -- confirm in the implementation).
 void MultiSoftmaxCPU_T_parallel(Matrix& elts, std::vector<Matrix*>& B, Matrix& probs, Matrix& fixed, int size, bool nofix);
 #endif	/* MULTISOFTMAX_H */
--- a/include/neuron.cuh
+++ b/include/neuron.cuh
@ -0,0 +1,529 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef NEURONS_CUH
 #define	NEURONS_CUH
 #include <assert.h>
 #include <string>
 #include <nvmatrix.cuh>
 #include <helper_cuda.h>
 /*
 * Adapter that turns a two-argument gradient functor into an accumulating
 * three-argument one: the result is op(unitActGrad, unitAct) + target.
 */
 template <class GradientOp>
 class AddGradientBinaryOperator {
    GradientOp _op;
 public:
    AddGradientBinaryOperator(GradientOp op) : _op(op) {}
    __device__ inline float operator()(const float unitActGrad, const float unitAct, const float target) const {
        const float grad = _op(unitActGrad, unitAct);
        return grad + target;
    }
 };
 /*
 * Adapter that turns a one-argument gradient functor into an accumulating
 * two-argument one: the result is target + op(unitActGrad).
 */
 template <class GradientOp>
 class AddGradientOperator {
    GradientOp _op;
 public:
    AddGradientOperator(GradientOp op) : _op(op) {}
    __device__ inline float operator()(const float unitActGrad, const float target) const {
        const float grad = _op(unitActGrad);
        return target + grad;
    }
 };
 /* =======================
 * Neuron
 * -----------------------
 * 
 * f(x) = x
 * =======================
 */
 class Neuron {
 protected:
    // Set by activate(); computeInputGrad() asserts on it.
    bool _activated;
    // Inputs and outputs potentially point to the same matrix, depending on the neuron.
    // They hold the addresses of the matrices passed to the most recent activate() call.
    NVMatrix* _inputs, *_outputs;
    // Forward hook. Identity: copy the inputs to the outputs unless both are the same matrix.
    virtual void _activate() {
        if (_inputs != _outputs) {
            _inputs->copy(*_outputs);
        }
    }
    // Backward hook used when add == false. For the identity the input gradient
    // is actsGrad itself, so it is copied to target unless both are the same matrix.
    virtual void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            actsGrad.copy(target);
        }
    }
    // Backward hook used when add == true: add actsGrad onto target.
    // Nothing is done when target and actsGrad are the same matrix.
    virtual void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        if (&target != &actsGrad) {
            target.add(actsGrad);
        }
    }
 public:
    Neuron() : _activated(false), _inputs(NULL), _outputs(NULL) {
    }
    // Virtual so that an instance of a subclass can be destroyed safely through
    // a Neuron pointer or reference (for example one obtained from makeNeuron).
    // The matrices behind _inputs/_outputs are only referenced, never deleted here.
    virtual ~Neuron() {
    }
    // Remembers the given matrices and runs the forward hook on them.
    virtual void activate(NVMatrix& inputs, NVMatrix& outputs) {
        _activated = true;
        _inputs = &inputs;
        _outputs = &outputs;
        _activate();
    }
    // Computes the gradient with respect to the neuron's input. Requires a prior
    // activate() call. With add == false, target is resized using actsGrad and then
    // written by _computeInputGrad; with add == true, _addInputGrad is invoked and
    // target is not resized.
    virtual void computeInputGrad(NVMatrix& actsGrad, NVMatrix& target, bool add) {
        assert(_activated);
        if (!add) {
            target.resize(actsGrad);
            _computeInputGrad(actsGrad, target);
        } else {
            _addInputGrad(actsGrad, target);
        }
    }
    // Factory declared here and defined elsewhere; builds a neuron from a Python dictionary.
    static Neuron& makeNeuron(PyObject* neuronDict);
 };
 /* =======================
 * LogisticNeuron
 * -----------------------
 * 
 * f(x) = 1 / (1 + e^-x)
 * =======================
 */
 class LogisticNeuron : public Neuron {
 protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Logistic(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(LogisticGradientOperator(), *_outputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<LogisticGradientOperator>(LogisticGradientOperator()), *_outputs, target, target);
    }
 public:
    class LogisticGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return unitActGrad * unitAct * (1.0f - unitAct); 
        }
    };
    LogisticNeuron() : Neuron() {
    }
 };
 /* =======================
 * ReluNeuron
 * -----------------------
 * 
 * f(x) = max(0, x)
 * =======================
 */
 class ReluNeuron : public Neuron {
 protected:
    virtual void _activate() {
        _inputs->apply(ReluOperator(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(ReluGradientOperator(), *_outputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<ReluGradientOperator>(ReluGradientOperator()), *_outputs, target, target);
    }
 public:
    class ReluOperator {
    public:    
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x;
        }
    };
    class ReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const  {
            return unitActGrad * (unitAct > 0.0f); 
        }
    };
    ReluNeuron() : Neuron() {
    }
 };
 /* =======================
 * NoisyReluNeuron
 * -----------------------
 * 
 * f(x) = max(0, max(0, x) + gaussian noise with variance equal to max(0, x))
 * =======================
 */
 class NoisyReluNeuron : public ReluNeuron {
 protected:
    void _activate() {
        ReluNeuron::_activate();
        _outputs->addGaussianNoise(*_outputs, false);
        _outputs->apply(ReluOperator());
    }
 public:
    NoisyReluNeuron() : ReluNeuron() {
    }
 };
 /* =======================
 * BoundedReluNeuron
 * -----------------------
 * 
 * f(x) = min(a, max(0, x))
 * =======================
 */
 class BoundedReluNeuron : public Neuron {
 protected:
    float _a;
    void _activate() {
        _inputs->apply(BoundedReluOperator(_a), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(BoundedReluGradientOperator(_a), *_outputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<BoundedReluGradientOperator>(BoundedReluGradientOperator(_a)), *_outputs, target, target);
    }
 public:
    class BoundedReluOperator {
    private:
        float _a;
    public:
        BoundedReluOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float x) const {
            return x < 0.0f ? 0.0f : x > _a ? _a : x;
        }
    };
    class BoundedReluGradientOperator {
    private:
        float _a;
    public:
        BoundedReluGradientOperator(float a) : _a(a) {
        }
        __device__ inline float operator()(float unitActGrad, float unitAct) const  {
            return unitActGrad * (unitAct > 0.0f) * (unitAct < _a); 
        }
    };
    BoundedReluNeuron(float a) : Neuron(), _a(a) {
    }
 };
 /* =======================
 * AbsNeuron
 * -----------------------
 * 
 * f(x) = abs(x)
 * =======================
 */
 class AbsNeuron : public Neuron {
 protected:
    void _activate() {
        assert(_inputs != _outputs);
        _inputs->apply(NVMatrixOps::Abs(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AbsGradientOperator(), *_inputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<AbsGradientOperator>(AbsGradientOperator()), *_inputs, target, target);
    }
 public:
    class AbsGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const  {
            return unitActGrad * (unitInput > 0.0f ? 1.0f : -1.0f); 
        }
    };
    AbsNeuron() : Neuron() {
    }
 };
 /* =======================
 * TanhNeuron
 * -----------------------
 * 
 * f(x) = a*tanh(b*x)
 * =======================
 */
 class TanhNeuron : public Neuron {
 protected:
    // Output scale (_a) and input scale (_b) of f(x) = a*tanh(b*x).
    float _a, _b;
    // Forward pass: apply a*tanh(b*x) from inputs into outputs.
    void _activate() {
        _inputs->apply(TanhOperator(_a, _b), *_outputs);
    }
    // Backward pass (overwrite): the gradient is computed from the stored outputs.
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(TanhGradientOperator(_a, _b), *_outputs, target);
    }
    // Backward pass (accumulate): same gradient, added onto the current target.
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<TanhGradientOperator>(TanhGradientOperator(_a, _b)), *_outputs, target, target);
    }
 public:
    class TanhOperator {
    private:
        // _n2b caches -2*b so the device code needs a single multiply per element.
        float _a, _n2b;
    public:
        TanhOperator(float a, float b) : _a(a), _n2b(-2*b) {
        }
        // Deliberately not virtual: this functor is handed to NVMatrix::apply,
        // which presumably forwards it by value to a CUDA kernel, and CUDA does
        // not allow objects of classes with virtual functions as kernel arguments.
        __device__ inline float operator()(float x) const {
            // tanh(b*x) written as 2 / (1 + e^(-2*b*x)) - 1
            return _a * (__fdividef(2.0f, 1.0f + __expf(x * _n2b)) - 1.0f);
        }
    };
    class TanhGradientOperator {
    private:
        float _b, _a;
    public:
        TanhGradientOperator(float a, float b) : _b(b), _a(a) {
        }
        __device__ inline float operator()(float unitActGrad, float unitAct) const  {
            // With y = a*tanh(b*x): dy/dx = a*b*(1 - tanh^2(b*x)) = b * (a - y^2 / a)
            return unitActGrad * _b * (_a - __fdividef(unitAct * unitAct, _a));
        }
    };
    TanhNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
 };
 /* =======================
 * DoubleReluNeuron
 * -----------------------
 * 
 * f(x) = x - a*tanh(x/a)
 * =======================
 */
 class DoubleReluNeuron : public Neuron {
 protected:
    // Scale parameter a of f(x) = x - a*tanh(x/a).
    float _a;
    // Forward pass. The gradient is computed from the inputs, so the
    // activation must not be done in place.
    void _activate() {
        assert(_inputs != _outputs);
        _inputs->apply(DoubleReluOperator(_a), *_outputs);
    }
    // Backward pass (overwrite): gradient computed from the stored inputs.
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(DoubleReluGradientOperator(_a), *_inputs, target);
    }
    // Backward pass (accumulate): same gradient, added onto the current target.
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<DoubleReluGradientOperator>(DoubleReluGradientOperator(_a)), *_inputs, target, target);
    }
 public:
    class DoubleReluOperator {
    private:
        // _n2a caches -2/a for the exponential form of tanh(x/a).
        float _a, _n2a;
    public:
        DoubleReluOperator(float a) : _a(a), _n2a(-2.0f / a) {
        }
        // Deliberately not virtual: this functor is handed to NVMatrix::apply,
        // which presumably forwards it by value to a CUDA kernel, and CUDA does
        // not allow objects of classes with virtual functions as kernel arguments.
        __device__ inline float operator()(float x) const {
            // tanh(x/a) written as 2 / (1 + e^(-2*x/a)) - 1
            return x - _a * (__fdividef(2.0f, 1.0f + __expf(_n2a * x)) - 1.0f);
        }
    };
    class DoubleReluGradientOperator {
    private:
        float _n2a;
    public:
        DoubleReluGradientOperator(float a) : _n2a(-2.0f / a) {
        }
        __device__ inline float operator()(float unitActGrad, float unitInput) const  {
            // d/dx [x - a*tanh(x/a)] = 1 - sech^2(x/a) = tanh^2(x/a).
            // The local is not called `tanh` so it does not shadow the math function.
            const float t = __fdividef(2.0f, 1.0f + __expf(_n2a * unitInput)) - 1.0f;
            return unitActGrad * (t*t);
        }
    };
    DoubleReluNeuron(float a) : Neuron(), _a(a) {
    }
 };
 /* =======================
 * SoftReluNeuron
 * -----------------------
 * 
 * f(x) = log(1 + e^x)
 * =======================
 */
 class SoftReluNeuron : public Neuron {
 protected:
    void _activate() {
        assert(_inputs != _outputs);
        _inputs->apply(SoftReluOperator(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SoftReluGradientOperator(), *_inputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SoftReluGradientOperator>(SoftReluGradientOperator()), *_inputs, target, target);
    }
 public:
    class SoftReluOperator {
    public:    
        __device__ inline float operator()(float x) const {
            // This piece-wise implementation has better numerical stability than
            // simply computing log(1 + e^x).
            return x > 4.0f ? x : __logf(1.0f + __expf(x));
        }
    };
    class SoftReluGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const  {
            if (unitInput > 4.0f) {
                return unitActGrad;
            }
            const float f = __expf(unitInput);
            return unitActGrad * __fdividef(f, 1.0f + f); 
        }
    };
    SoftReluNeuron() : Neuron() {
    }
 };
 /* =======================
 * SquareNeuron
 * -----------------------
 * 
 * f(x) = x^2
 * =======================
 */
 class SquareNeuron : public Neuron {
 protected:
    void _activate() {
        assert(_inputs != _outputs);
        _inputs->apply(NVMatrixOps::Square(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SquareGradientOperator(), *_inputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SquareGradientOperator>(SquareGradientOperator()), *_inputs, target, target);
    }
 public:
    class SquareGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitInput) const {
            return unitActGrad * 2.0f * unitInput; 
        }
    };
    SquareNeuron() : Neuron() {
    }
 };
 /* =======================
 * SqrtNeuron
 * -----------------------
 * 
 * f(x) = sqrt(x)
 * =======================
 */
 class SqrtNeuron : public Neuron {
 protected:
    void _activate() {
        _inputs->apply(NVMatrixOps::Sqrt(), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(SqrtGradientOperator(), *_outputs, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyTernary(AddGradientBinaryOperator<SqrtGradientOperator>(SqrtGradientOperator()), *_outputs, target, target);
    }
 public:
    class SqrtGradientOperator {
    public:
        __device__ inline float operator()(float unitActGrad, float unitAct) const {
            return __fdividef(unitActGrad, 2.0f * unitAct); 
        }
    };
    SqrtNeuron() : Neuron() {
    }
 };
 /* =======================
 * LinearNeuron
 * -----------------------
 * 
 * f(x) = a*x + b
 * =======================
 */
 class LinearNeuron : public Neuron {
 protected:
    float _a, _b;
    void _activate() {
        _inputs->apply(LinearOperator(_a, _b), *_outputs);
    }
    void _computeInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.scale(_a, target);
    }
    void _addInputGrad(NVMatrix& actsGrad, NVMatrix& target) {
        actsGrad.applyBinary(AddGradientOperator<NVMatrixOps::MultByScalar>(NVMatrixOps::MultByScalar(_a)), target, target);
    }
 public:
    class LinearOperator {
    protected:
        float _a, _b;
    public:    
        __device__ inline float operator()(float x) const {
            return _a * x + _b;
        }
        LinearOperator(float a, float b) : _a(a), _b(b) {
        }
    };
    LinearNeuron(float a, float b) : Neuron(), _a(a), _b(b) {
    }
 };
 #endif	/* NEURONS_CUH */
--- a/include/pipedispenser.cuh
+++ b/include/pipedispenser.cuh
@ -0,0 +1,139 @@
 /*
 * pipedispenser.cuh
 *
 *  Created on: 2013-03-01
 *      Author: spoon
 */
 #ifndef PIPEDISPENSER_CUH_
 #define PIPEDISPENSER_CUH_
 #include <pthread.h>
 #include <set>
 #include <algorithm>
 #include <iterator>
 #include <util.cuh>
 class PipeDispenser {
 protected:
    int _numPipes;
    seti _pipes;
    pthread_mutex_t *_mutex;
    void lock() {
        pthread_mutex_lock(_mutex);
    }
    void unlock() {
        pthread_mutex_unlock(_mutex);
    }
 public:
    PipeDispenser(const seti& pipes) {
        _pipes.insert(pipes.begin(), pipes.end());
        _mutex = (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
        pthread_mutex_init(_mutex, NULL);
    }
    virtual ~PipeDispenser() {
        pthread_mutex_destroy(_mutex);
        free(_mutex);
    }
    virtual int getPipe(const seti& interested) = 0;
    int getPipe(int interested) {
        seti tmp;
        tmp.insert(interested);
        return getPipe(tmp);
    }
    virtual void freePipe(int pipe) = 0;
 };
 /*
 * This one blocks until there is a free pipe to return.
 */
 class PipeDispenserBlocking : public PipeDispenser {
 protected:
    pthread_cond_t *_cv;
    void wait() {
        pthread_cond_wait(_cv, _mutex);
    }
    void broadcast() {
        pthread_cond_broadcast(_cv);
    }
    int getAvailablePipes(const seti& interested, intv& available) {
        available.clear();
        std::set_intersection(_pipes.begin(), _pipes.end(), interested.begin(), interested.end(), std::back_inserter(available));
        return available.size();
    }
 public:
    PipeDispenserBlocking(const seti& pipes) : PipeDispenser(pipes) {
        _cv = (pthread_cond_t*)(malloc(sizeof (pthread_cond_t)));
        pthread_cond_init(_cv, NULL);
    }
    ~PipeDispenserBlocking() {
        pthread_cond_destroy(_cv);
        free(_cv);
    }
    int getPipe(const seti& interested) {
        lock();
        intv avail;
        while (getAvailablePipes(interested, avail) == 0) {
            wait();
        }
        int pipe = avail[0];
        _pipes.erase(pipe);
        unlock();
        return pipe;
    }
    void freePipe(int pipe) {
        lock();
        _pipes.insert(pipe);
        broadcast();
        unlock();
    }
 };
 /*
 * This one returns the least-occupied pipe.
 */
 class PipeDispenserNonBlocking : public PipeDispenser  {
 protected:
    std::map<int,int> _pipeUsers;
 public:
    PipeDispenserNonBlocking(const seti& pipes) : PipeDispenser(pipes) {
        for (seti::iterator it = pipes.begin(); it != pipes.end(); ++it) {
            _pipeUsers[*it] = 0;
        }
    }
    int getPipe(const seti& interested) {
        lock();
        int pipe = -1, users = 1 << 30;
        for (seti::iterator it = _pipes.begin(); it != _pipes.end(); ++it) {
            if (interested.count(*it) > 0 && _pipeUsers[*it] < users) {
                pipe = *it;
                users = _pipeUsers[*it];
            }
        }
        if (pipe >= 0) {
            _pipeUsers[pipe]++;
        }
        unlock();
        return pipe;
    }
    void freePipe(int pipe) {
        lock();
        _pipeUsers[pipe]--;
        unlock();
    }
 };
 #endif /* PIPEDISPENSER_CUH_ */
--- a/include/pyconvnet.cuh
+++ b/include/pyconvnet.cuh
@ -0,0 +1,43 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef PYCONVNET3_CUH
 #define	PYCONVNET3_CUH
 // Two-step stringification: QUOTEME(x) lets x be macro-expanded first and
 // _QUOTEME then turns the result into a string literal.
 #define _QUOTEME(x) #x
 #define QUOTEME(x) _QUOTEME(x)
 // Module initialization function with C linkage. INITNAME is not defined in
 // this header; presumably a macro supplied by the build -- confirm.
 extern "C" void INITNAME();
 // Entry points with the (self, args) -> PyObject* signature; presumably
 // registered as the Python module's methods -- confirm in the implementation file.
 PyObject* initModel(PyObject *self, PyObject *args);
 PyObject* startBatch(PyObject *self, PyObject *args);
 PyObject* finishBatch(PyObject *self, PyObject *args);
 PyObject* checkGradients(PyObject *self, PyObject *args);
 PyObject* syncWithHost(PyObject *self, PyObject *args);
 PyObject* startMultiviewTest(PyObject *self, PyObject *args);
 PyObject* startFeatureWriter(PyObject *self, PyObject *args);
 PyObject* startDataGrad(PyObject *self, PyObject *args);
 #endif
--- a/include/quantizer.cuh
+++ b/include/quantizer.cuh
@ -0,0 +1,43 @@
 /*
 * quantizer.cuh
 *
 *  Created on: 2013-02-15
 *      Author: spoon
 */
 #ifndef QUANTIZER_CUH_
 #define QUANTIZER_CUH_
 #include <Python.h>
 #include <util.cuh>
 #include <string>
 #include <nvmatrix.cuh>
 #include <conv_util.cuh>
 /*
 * Base class of the quantizers. The public quantize/dequantize methods are
 * non-virtual; subclasses customize behavior through the protected virtual
 * _quantize/_dequantize hooks. Only declarations are visible in this header.
 */
 class Quantizer {
 protected:
    // NOTE(review): presumably holds the quantized data between quantize() and
    // dequantize(); ownership is not visible here -- confirm in the implementation.
    NVMatrix* _quantized;
    // NOTE(review): presumably the shape and transposition of the last quantized source -- confirm.
    int _numRows, _numCols;
    bool _trans;
    virtual void _quantize(NVMatrix& src, NVMatrix& tgt);
    virtual void _dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput);
 public:
    Quantizer();
    virtual ~Quantizer();
    void quantize(NVMatrix& src, NVMatrix& tgt);
    void dequantize(NVMatrix& tgt);
    // NOTE(review): scaleTarget/scaleOutput presumably weight the existing target
    // contents and the dequantized values respectively -- confirm.
    void dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput);
    // Factory taking a Python dictionary; defined elsewhere.
    static Quantizer& make(PyObject* qDict);
 };
 /*
 * Quantizer subclass that overrides both protected hooks.
 * NOTE(review): the name suggests half-precision storage -- confirm in the implementation.
 */
 class HalfQuantizer : public Quantizer {
 protected:
    void _quantize(NVMatrix& src, NVMatrix& tgt);
    void _dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput);
 public:
    HalfQuantizer();
 };
 #endif /* QUANTIZER_CUH_ */
--- a/include/softmaxtree.cuh
+++ b/include/softmaxtree.cuh
@ -0,0 +1,144 @@
 /* 
 * File:   softmaxtree.h
 * Author: Alex Krizhevsky
 *
 * Created on September 9, 2012, 5:50 PM
 */
 #ifndef SOFTMAXTREE_H
 #define	SOFTMAXTREE_H
 #include <helper_cuda.h>
 #include <string>
 #include <map>
 #include <vector>
 #include <algorithm>
 #include <assert.h>
 #include <nvmatrix.cuh>
 #include <matrix.h>
 class SoftmaxNode;
 class SoftmaxTree;
 typedef std::vector<SoftmaxNode*> SoftmaxNodeV;
 /*
 * One node of a SoftmaxTree. Holds pointers to its children and its parent
 * plus integer bookkeeping (depth, height, size, label). SoftmaxTree is a
 * friend and can therefore use the protected members directly.
 */
 class SoftmaxNode {
    friend class SoftmaxTree;
 protected:
    SoftmaxNodeV _children;
    // May be null (see getParent below).
    SoftmaxNode* _parent;
    int _depth, _height, _size;
    int _label;
    /*
     * Computes height for entire subtree rooted at this node and populates
     * given height->nodes map.
     */
    int setDistances(std::map<int, SoftmaxNodeV*>& nodeHeights,
                     std::map<int, SoftmaxNodeV*>& nodeDepths);
    // NOTE(review): presumably accumulates node and leaf counts into the two
    // reference arguments -- confirm in the implementation.
    void setNodeCounts(int &nodes, int& leaves);
    /*
     * Compute the number of leaves in this subtree, which is a good estimate
     * of the number of training cases it represents.
     */
    int setSizes(ushort* nodeSizes);
 public:
    SoftmaxNode(SoftmaxNode* parent, int label);
    ~SoftmaxNode();
    // Returns a reference to a child node for the given label.
    // NOTE(review): presumably creates the child and appends it to _children -- confirm.
    SoftmaxNode& addChild(int label);
    int getDepth() const;
    int getHeight() const;
    int getLabel() const;
    int getSize() const;
    SoftmaxNode* getParent(); // Might be null, so must be pointer
    SoftmaxNodeV& getChildren();
 };
 /*
 * numLabels: the number of leaves in the tree (normally 1000)
 * numNodes: the total number of nodes in the tree
 */
 class SoftmaxTree {
    friend class SoftmaxNode;
 protected:
    SoftmaxNode* _root;
    // Nodes grouped by height and by depth (the maps that SoftmaxNode::setDistances takes).
    std::map<int, SoftmaxNodeV*> _nodeHeights, _nodeDepths;
    /*
     * Map from depth --> ushort2[]
     * where each ushort2 gives the index and parent index
     * of a node at the given depth.
     */
    std::map<int, ushort2*> _nodeFwdMeta;
    /*
     * Map from height --> ushort2[]
     * where each ushort2 gives the index and number of children
     * of a node at the given height.
     */
    std::map<int, ushort2*> _nodeBwdMeta;
    /*
     * Map from height --> ushort[][]
     * where each ushort[] gives children of a given node at a given height.
     */
    std::map<int, ushort**> _nodeChildMeta;
    /*
     * An array of length numNodes with index i storing the number
     * of leaves in subtree rooted at node with label i.
     */
    ushort* _nodeSizes;
    int _numNodes, _numLeaves;
    // Internal setup steps, defined in the implementation file.
    // NOTE(review): presumably invoked from finalize() -- confirm.
    void setDistances();
    void setNodeCounts();
    void setNodeSizes();
    void setFwdMeta();
    void setBwdMeta();
    void preprocess(NVMatrix& inp);
    void postprocess(NVMatrix& inp);
 public:
    SoftmaxTree(int rootLabel);
    ~SoftmaxTree();
    // NOTE(review): presumably must be called once the tree structure is complete,
    // before the query and weight routines below are used -- confirm.
    void finalize();
    SoftmaxNode& getRoot();
    SoftmaxNodeV& getNodesAtHeight(int height);
    SoftmaxNodeV& getNodesAtDepth(int depth);
    int getHeight() const;
    int getDepth() const;
    int getNumLeaves() const;
    int getNumNodes() const;
    /*
     * offsets: (numNodes, numFeatures)
     * targets: (numNodes, numFeatures) 
     */
    void makeWeights(NVMatrix& offsets, NVMatrix& targets);
    /*
     * grads: (numNodes, numFeatures)
     * 
     * The idea is that grads contains gradients for the leaves 
     * (i.e. the first numLabels rows), so this routine will
     * distribute them up the tree.
     */
    void distributeGradients(NVMatrix& grads);
    /*
     * inc := mom * inc - wc * epsW * weight + epsW * grad
     * weight := weight + inc
     * 
     * weights: (numNodes, numFeatures)
     * incs:    (numNodes, numFeatures)
     * grads:   (numNodes , numFeatures)
     */
    void updateWeights(NVMatrix& weights, NVMatrix& incs, NVMatrix& grads, float epsWBase, float mom, float wcBase);
 };
 #endif	/* SOFTMAXTREE_H */
--- a/include/util.cuh
+++ b/include/util.cuh
@ -0,0 +1,113 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef UTIL_H
 #define	UTIL_H
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <vector>
 #include <map>
 #include <set>
 #include <string>
 #include <sstream>
 #include <string>
 #include <Python.h>
 #include <nvmatrix.cuh>
 #include <matrix.h>
 /*
 * The types of passes that the convnet supports. Used in the fprop and bprop functions in
 * ConvNet class. Most of the layers ignore the pass type, but some make use of it.
 */
 //enum PASS_TYPE {PASS_TRAIN,
 //                PASS_TEST,
 //                PASS_GC,
 //                PASS_MULTIVIEW_TEST,
 //                PASS_MULTIVIEW_TEST_START,
 //                PASS_MULTIVIEW_TEST_END,
 //                PASS_FEATURE_GEN};
 // Pass types are bit flags stored in a uint. Note that PASS_TYPE is a macro,
 // not a typedef, so it is substituted textually wherever it appears.
 #define PASS_TYPE                   uint
 #define PASS_TRAIN                  0x1
 #define PASS_TEST                   0x2
 #define PASS_GC                     0x4
 // The multiview values include the PASS_TEST bit, and START/END additionally
 // include the PASS_MULTIVIEW_TEST bits.
 #define PASS_MULTIVIEW_TEST         (PASS_TEST | 0x8)
 #define PASS_MULTIVIEW_TEST_START   (PASS_MULTIVIEW_TEST | 0x10)
 #define PASS_MULTIVIEW_TEST_END     (PASS_MULTIVIEW_TEST | 0x20)
 #define PASS_FEATURE_GEN            0x40
 // True when every bit of f is set in x. Argument order is (flag, value).
 #define HAS_FLAG(f, x)              (((x) & (f)) == (f))
 #define IS_MULTIVIEW_TEST(x)        HAS_FLAG(PASS_MULTIVIEW_TEST, x)
 #define IS_MULTIVIEW_TEST_START(x)  HAS_FLAG(PASS_MULTIVIEW_TEST_START, x)
 #define IS_MULTIVIEW_TEST_END(x)    HAS_FLAG(PASS_MULTIVIEW_TEST_END, x)
 // For gradient checking
 #define GC_SUPPRESS_PASSES          false
 #define GC_REL_ERR_THRESH           0.02
 /*
 * Generates a random floating point number in the range 0-1.
 * Both ends are possible, since rand() can return 0 and RAND_MAX.
 */
 #define randf                       ((float)rand() / RAND_MAX)
 // Short aliases for the container types used throughout the project.
 typedef std::vector<Matrix*> MatrixV;
 typedef std::vector<NVMatrix*> NVMatrixV;
 typedef std::map<std::string,std::vector<double>*> CostMap;
 typedef std::map<std::string,double> CostCoeffMap;
 typedef std::vector<double> doublev;
 typedef std::vector<float> floatv;
 typedef std::vector<int> intv;
 typedef std::vector<std::string> stringv;
 typedef std::set<int> seti;
 // Converters from a Python list object to C++ containers; defined elsewhere.
 // NOTE(review): the pointer-returning variants presumably allocate their result
 // and leave ownership with the caller -- confirm in the implementation.
 stringv* getStringV(PyObject* pyList);
 floatv* getFloatV(PyObject* pyList);
 intv* getIntV(PyObject* pyList);
 MatrixV* getMatrixV(PyObject* pyList);
 MatrixV* getMatrixV(PyObject* pyList, int len);
 int* getIntA(PyObject* pyList);
 // Lookups of `key` in a Python dictionary object, converted to the named C++ type.
 // NOTE(review): behavior for a missing key is not visible here -- confirm.
 int pyDictGetInt(PyObject* dict, const char* key);
 intv* pyDictGetIntV(PyObject* dict, const char* key);
 std::string pyDictGetString(PyObject* dict, const char* key);
 float pyDictGetFloat(PyObject* dict, const char* key);
 floatv* pyDictGetFloatV(PyObject* dict, const char* key);
 Matrix* pyDictGetMatrix(PyObject* dict, const char* key);
 MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key);
 int* pyDictGetIntA(PyObject* dict, const char* key);
 stringv* pyDictGetStringV(PyObject* dict, const char* key);
 /*
 * Converts n to text by streaming it into a std::ostringstream with
 * operator<< and returning the accumulated string.
 */
 template<typename T>
 std::string tostr(T n) {
    std::ostringstream buffer;
    buffer << n;
    std::string text = buffer.str();
    return text;
 }
 #endif	/* UTIL_H */
--- a/include/weights.cuh
+++ b/include/weights.cuh
@ -0,0 +1,150 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef WEIGHTS_CUH
 #define	WEIGHTS_CUH
 #include <string>
 #include <vector>
 #include <iostream>
 #include <helper_cuda.h>
 #include <assert.h>
 #include <nvmatrix.cuh>
 #include <matrix.h>
 #include "util.cuh"
 #include "softmaxtree.cuh"
 #include <lr.cuh>
 using namespace std;
 // One weight matrix together with its update state: Matrix and NVMatrix
 // pointers for the weights and their increments, gradient buffers, a
 // learning-rate schedule and scalar hyper-parameters.
 // NOTE(review): only declarations are visible here; member meanings noted
 // below that rest on naming are marked as presumptions.
 class Weights {
 protected:
    // Matrix-typed weights and increments (cf. getCPUW() / getCPUWInc()).
    Matrix* _hWeights, *_hWeightsInc;
    // NVMatrix-typed weights, increments and gradient (cf. getW() / getInc() / getGrad()).
    NVMatrix* _weights, *_weightsInc, *_weightsGrad;
    // NOTE(review): presumably running averages of the gradient and of the
    // squared gradient (see Grad2AvgOperator) -- confirm in the implementation.
    NVMatrix* _weightsGradAvg, *_weightsGrad2Avg;
    LearningRateSchedule* _lrs;
    // Scalar hyper-parameters; set from the constructor arguments of the same names, presumably.
    float _wc, _mom, _wball, _superEps;
    bool _onGPU, _useGrad, _cleanup;
    // See getNumUpdates() for the meaning of this counter.
    int _numUpdates;
    // Non-NULL if these weights are really shared from some other layer
    Weights* _srcWeights;
 public:
    // Binary functor producing an exponential moving average of the squared
    // gradient: mom * G2 + (1 - mom) * g * g.
    class Grad2AvgOperator {
    private:
        float _mom;
    public:
        Grad2AvgOperator(float mom) : _mom(mom) {
        }
        // G2 is the previous average, g the current gradient value.
        __device__ inline float operator()(const float G2, const float g) const {
            return _mom * G2 + (1.0f - _mom) * g * g;
        }
    };
    NVMatrix& operator*() const;
    // Constructs weights that refer to srcWeights (see _srcWeights) with their own schedule.
    Weights(Weights& srcWeights, LearningRateSchedule& lrs);
    Weights(Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wc, float wball, float mom, float superEps, bool useGrad, bool cleanup=true);
    virtual ~Weights();
    virtual NVMatrix& getW() const;
    virtual NVMatrix& getInc() const;
    virtual NVMatrix& getGrad() const;
    virtual Matrix& getCPUW() const;
    virtual Matrix& getCPUWInc() const;
    virtual LearningRateSchedule& getLearningRateSchedule() const;
    virtual int getNumRows() const;
    virtual int getNumCols() const;
    virtual void copyToCPU();
    // This function is assumed to be called in the order in which the layers
    // were defined
    virtual void copyToGPU();
    // progress is forwarded from the caller; NOTE(review): presumably the
    // training progress used to evaluate the learning-rate schedule -- confirm.
    virtual void update(float progress);
    int incNumUpdates();
    // Returns the number of times a gradient has been computed for this
    // weight matrix during the current pass (interval between two calls of update())
    // through the net. This number will only be greater than 1 if this weight matrix
    // is *shared* by multiple layers in the net.
    int getNumUpdates() const;
    float getEps(float progress) const;
    float getMom() const;
    float getWC() const;
    float getWBall() const;
    bool isUseGrad() const;
    // NOTE(review): presumably true when _srcWeights is NULL, i.e. the weights are not shared -- confirm.
    bool isOwner() const;
    float getSuperEps() const;
 };
 // Weights specialisation that is constructed with a SoftmaxTree and overrides
 // the copy/update/accessor methods of Weights.
 // NOTE(review): the leaf/"all" split below is inferred from member names;
 // confirm the exact semantics in the implementation.
 class TreeWeights : public Weights {
 protected:
    // Held by value, unlike the pointer members below.
    NVMatrix _effWeights;
    NVMatrix* _leafWeights, *_leafGrad, *_leafInc;
    SoftmaxTree* _tree;
 public:
    // Overrides of the corresponding virtual methods in Weights.
    void copyToGPU();
    void update(float progress);
    NVMatrix& getW() const;
    NVMatrix& getInc() const;
    NVMatrix& getGrad() const;
    // Additional accessors that exist only on TreeWeights.
    NVMatrix& getAllW() const;
    NVMatrix& getAllInc() const;
    NVMatrix& getAllGrad() const;
    int getNumRows() const;
    void makeWeights();
    void distributeGradients();
    // Note: takes wcBase and mom only; there are no wball/superEps/useGrad
    // parameters as in the general Weights constructor.
    TreeWeights(SoftmaxTree& tree, Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wcBase, float mom);
 };
 // Weights subclass whose constructor is handed existing NVMatrix objects for
 // the weights, increments and gradients and takes no learning-rate schedule
 // or hyper-parameters. It adds no members or methods of its own.
 // NOTE(review): presumably wraps externally managed matrices -- confirm.
 class DummyWeights : public Weights {
 public:
    DummyWeights(Matrix& hWeights, Matrix& hWeightsInc, NVMatrix& weights, NVMatrix& incs, NVMatrix& grads);
 };
 // Container of Weights pointers offering indexed access and the bulk
 // operations update / copyToCPU / copyToGPU.
 class WeightList {
 private:
    std::vector<Weights*> _weightList;
 public:
    // Returns a reference to the Weights stored at position idx.
    Weights& operator[](const int idx) const;
    // NOTE(review): whether the destructor deletes the stored Weights is not
    // visible from this header -- confirm before sharing Weights between lists.
    ~WeightList();
    WeightList();
    void addWeights(Weights& w);
    // NOTE(review): presumably forwards to the same-named method of every
    // stored Weights -- confirm in the implementation.
    void update(float progress);
    void copyToCPU();
    void copyToGPU();
    int getSize() const;
 };
 #endif	/* WEIGHTS_CUH */
--- a/include/worker.cuh
+++ b/include/worker.cuh
@ -0,0 +1,122 @@
 /* 
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef WORKER_CUH
 #define	WORKER_CUH
 #include "convnet.cuh"
 #include "cost.cuh"
 #include "data.cuh"
 class ConvNet;
 class Cost;
 // Result record produced by a worker: a result type and, optionally, a Cost.
 class WorkResult {
 public:
    // Kind of result being reported.
    enum RESULTS {BATCH_DONE, SYNC_DONE};
 protected:
    WorkResult::RESULTS _resultType;
    // NOTE(review): the single-argument constructor takes no Cost, so this is
    // presumably NULL in that case -- confirm before calling getResults().
    Cost* _results;
 public:
    WorkResult(WorkResult::RESULTS resultType, Cost& results);
    WorkResult(WorkResult::RESULTS resultType);
    virtual ~WorkResult();
    Cost& getResults() const;
    WorkResult::RESULTS getResultType() const;
 };
 // Abstract unit of work bound to a ConvNet; subclasses implement run().
 class Worker {
 protected:
    // Network passed to the constructor.
    // NOTE(review): presumably not owned by the worker -- confirm.
    ConvNet* _convNet;
 public:
    Worker(ConvNet& convNet);
    // Virtual so that deleting a derived worker through a Worker* runs the
    // derived destructor (DataWorker and others declare one); without it such
    // a delete is undefined behaviour.
    virtual ~Worker() {}
    // Performs the worker's task; implemented by each concrete subclass.
    virtual void run() = 0;
 };
 // Worker that additionally carries a CPUData batch and a DataProvider.
 // Still abstract: it does not implement run().
 class DataWorker : public Worker {
 protected:
    CPUData* _data;
    // NOTE(review): not a constructor argument, so presumably created by the
    // constructor and released by the destructor -- confirm.
    DataProvider* _dp;
 public:
    DataWorker(ConvNet& convNet, CPUData& data);
    virtual ~DataWorker();
 };
 // DataWorker configured with a progress value and a test flag.
 // NOTE(review): presumably runs a training pass, or an evaluation-only pass
 // when `test` is true -- confirm in the implementation of run().
 class TrainingWorker : public DataWorker {
 protected:
    bool _test;
    double _progress;
 public:
    TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test);
    void run();
 };
 // Worker that needs no data batch (derives from Worker, not DataWorker).
 // NOTE(review): presumably produces a WorkResult::SYNC_DONE result -- confirm.
 class SyncWorker : public Worker {
 public:
    SyncWorker(ConvNet& convNet);
    void run();
 };
 // DataWorker with no extra state of its own.
 // NOTE(review): presumably runs the net's gradient check on the given data
 // (cf. the GC_* macros in util.cuh) -- confirm in the implementation.
 class GradCheckWorker : public DataWorker {
 public:
    GradCheckWorker(ConvNet& convNet, CPUData& data);
    void run();
 };
 // DataWorker parameterised by a number of views, with an optional output
 // matrix and layer name supplied through the five-argument constructor.
 class MultiviewTestWorker : public DataWorker {
 protected:
    int _numViews;
    // NOTE(review): only the five-argument constructor receives cpuProbs and
    // softmaxName; the state of these members after the three-argument
    // constructor is not visible here -- confirm.
    Matrix* _cpuProbs;
    std::string _logregName;
 public:
    MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* softmaxName);
    MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews);
    ~MultiviewTestWorker();
    virtual void run();
 };
 // DataWorker given a list of matrices and a list of layer names.
 // NOTE(review): presumably writes the outputs of the named layers into the
 // corresponding matrices in ftrs -- confirm in the implementation of run().
 class FeatureWorker : public DataWorker {
 protected:
    MatrixV *_ftrs;
    stringv *_layerNames;
 public:
    FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames);
    // NOTE(review): whether _ftrs / _layerNames are deleted here is not visible -- confirm ownership.
    ~FeatureWorker();
    void run();
 };
 // DataWorker given an output matrix plus the indices of a data layer and a
 // softmax layer.
 // NOTE(review): presumably stores the gradient with respect to the data layer
 // into dataGrads -- confirm in the implementation of run().
 class DataGradWorker : public DataWorker {
 protected:
    Matrix* _dataGrads;
    int _dataLayerIdx, _softmaxLayerIdx;
 public:
    DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx);
    ~DataGradWorker();
    void run();
 };
 #endif	/* WORKER_CUH */
--- a/initw.py
+++ b/initw.py
@ -0,0 +1,21 @@
 from gpumodel import *
 import numpy as n
 import numpy.random as nr
 def get_src():
    """Load the hard-coded source checkpoint and return its layer table.

    The result is the ``['model_state']['layers']`` entry of the object
    returned by ``IGPUModel.load_checkpoint``. The checkpoint is re-read on
    every call.
    """
    checkpoint = IGPUModel.load_checkpoint('/nobackup/kriz/tmp/ConvNet__2012-09-19_23.29.04')
    model_state = checkpoint['model_state']
    return model_state['layers']
 def makew(name, idx, shapes, params):
    """Return the initial weight matrix number ``idx`` for layer ``name``.

    For the layer called 'localcombine' and ``idx == 2`` the matrix is drawn
    fresh: standard-normal noise scaled by 0.01, of shape
    ``shapes[0] x shapes[1]``, single precision, C order. In every other case
    the matrix is taken from the source checkpoint layer named by
    ``params[0]``. The checkpoint is loaded in both cases.
    """
    layers = get_src()
    layer_key = params[0]
    wants_random_init = (name == 'localcombine') and (idx == 2)
    if not wants_random_init:
        return layers[layer_key]['weights'][idx]
    noise = 0.01 * nr.randn(shapes[0], shapes[1])
    return n.array(noise, dtype=n.single, order='C')
 def makeb(name, shapes, params):
    """Return the biases of the source checkpoint layer named by ``params[0]``.

    ``name`` and ``shapes`` are accepted but not used.
    """
    layers = get_src()
    layer_key = params[0]
    source_layer = layers[layer_key]
    return source_layer['biases']
 def makec(name, idx, shapes, params):
    """Return entry ``idx`` of 'filterConns' from the source checkpoint layer.

    The layer is selected by ``params[0]``; ``name`` and ``shapes`` are
    accepted but not used.
    """
    layers = get_src()
    layer_key = params[0]
    filter_conns = layers[layer_key]['filterConns']
    return filter_conns[idx]
--- a/layer.py
+++ b/layer.py
--- a/layers-cifar/layer-params-18pct-noisylr.cfg
+++ b/layers-cifar/layer-params-18pct-noisylr.cfg
@ -0,0 +1,47 @@
 # 18% error on CIFAR-10 in 20 minutes - layer parameter file
 # Reduce all learning rates by factor of 10 after 120 epochs.
 # Then another factor of 10 after 10 more epochs.
 [conv1]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [conv2]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [conv3]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [fc10]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=1
 schedW=linear[1,1]
 [logprob]
 coeff=1
 [rnorm1]
 scale=0.00005
 pow=.75
 [rnorm2]
 scale=0.00005
 pow=.75
--- a/layers-cifar/layer-params-conv-local-13pct-noisylr.cfg
+++ b/layers-cifar/layer-params-conv-local-13pct-noisylr.cfg
@ -0,0 +1,45 @@
 # 13% error on CIFAR-10 - layer parameter file 
 # See methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
 [conv1]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 schedW=linear[1,1]
 [conv2]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.00
 schedW=linear[1,1]
 [local3]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [local4]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [fc10]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.004
 schedW=linear[1,1]
 [logprob]
 coeff=1
--- a/layers-cifar/layers-18pct.cfg
+++ b/layers-cifar/layers-18pct.cfg
@ -0,0 +1,106 @@
 # 18% error on CIFAR-10 in 20 minutes - layer definition file 
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=32
 padding=2
 stride=1
 filterSize=5
 initW=0.0001
 partialSum=4
 sharedBiases=1
 gpu=0
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 neuron=relu
 [rnorm1]
 type=rnorm
 inputs=pool1
 channels=32
 size=3
 [conv2]
 type=conv
 inputs=rnorm1
 filters=32
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool2]
 type=pool
 pool=avg
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=32
 [rnorm2]
 type=rnorm
 inputs=pool2
 channels=32
 size=3
 [conv3]
 type=conv
 inputs=rnorm2
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=32
 neuron=relu
 initW=0.01
 partialSum=4
 sharedBiases=1
 [pool3]
 type=pool
 pool=avg
 inputs=conv3
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [fc10]
 type=fc
 outputs=10
 inputs=pool3
 initW=0.01
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
 gpu=0
--- a/layers-cifar/layers-conv-local-13pct.cfg
+++ b/layers-cifar/layers-conv-local-13pct.cfg
@ -0,0 +1,95 @@
 # 13% error on CIFAR-10 in 20 minutes - layer definition file 
 # See methodology: http://code.google.com/p/cuda-convnet/wiki/Methodology
 [data]
 type=data
 dataIdx=0
 [labels]
 type=data
 dataIdx=1
 [conv1]
 type=conv
 inputs=data
 channels=3
 filters=64
 padding=2
 stride=1
 filterSize=5
 neuron=relu
 initW=0.0001
 partialSum=4
 sharedBiases=1
 gpu=0
 [pool1]
 type=pool
 pool=max
 inputs=conv1
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [conv2]
 type=conv
 inputs=pool1
 filters=64
 padding=2
 stride=1
 filterSize=5
 channels=64
 neuron=relu
 initW=0.01
 partialSum=8
 sharedBiases=1
 [pool2]
 type=pool
 pool=max
 inputs=conv2
 start=0
 sizeX=3
 stride=2
 outputsX=0
 channels=64
 [local3]
 type=local
 inputs=pool2
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=64
 neuron=relu
 initW=0.04
 [local4]
 type=local
 inputs=local3
 filters=32
 padding=1
 stride=1
 filterSize=3
 channels=32
 neuron=relu
 initW=0.04
 [fc10]
 type=fc
 outputs=10
 inputs=local4
 initW=0.01
 neuron=ident
 [probs]
 type=softmax
 inputs=fc10
 [logprob]
 type=cost.logreg
 inputs=labels,probs
 gpu=0
--- a/layers/layer-params-100.cfg
+++ b/layers/layer-params-100.cfg
@ -0,0 +1,157 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv2a]
 epsW=0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [conv2b]
 epsW=0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.001
 pow=0.25
 [rnorm1b]
 scale=0.001
 pow=0.25
 [rnorm2a]
 scale=0.001
 pow=0.25
 [rnorm2b]
 scale=0.001
 pow=0.25
 # on guppy7
 # this is like #97 (on gpu) but with different rnorm coeffs
 # /nobackup/kriz/tmp/ConvNet__2012-06-27_14.03.18
 # epoch 15: set epsw to 0.001 from 0.01
 # epoch 43: killed, seems slightly worse than using my old rnorm coeffs
--- a/layers/layer-params-106.cfg
+++ b/layers/layer-params-106.cfg
@ -0,0 +1,184 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #101 but with contrast normalization layers over rnorm2
 # on guppy7
 # logs/layers-106.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-07_21.11.34
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 31: killed, turns out weight contrast normalization is better
 # restart after fixing cnorm
 # on guppy9
 # logs/layers-106a.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-17_19.06.09
 # epoch 21: set epsw to 0.001 from 0.01
 # restart with proper learning rate
 # logs/layers-106b.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-19_04.15.40
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 61: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 72: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 79: set epsw to 0.00001 from 0.0001
 # epoch 93: killed
 # [1.5942473039940013, 0.3705782743769917, 0.16672222296297284]
--- a/layers/layer-params-107.cfg
+++ b/layers/layer-params-107.cfg
@ -0,0 +1,167 @@
 [conv1a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 wcNormMin=0.001
 wcNormMax=0.002
 [conv1b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 wcNormMin=0.001
 wcNormMax=0.002
 [conv2a]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like #101 but uses wcnorm in conv1/conv2. so it uses def file #101.
 # it's also like #104, but #104 only does wcnorm on conv2
 # on guppy7
 # logs/layers-107.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-09_19.20.14
--- a/layers/layer-params-109.cfg
+++ b/layers/layer-params-109.cfg
@ -0,0 +1,187 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1536a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1536b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1536ba]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1536bb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1536ca]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1536cb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs3a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [hs3b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like #101 but uses wcnorm in conv2 and also has 3 fc layers. 
 # on guppy9
 # logs/layers-109.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-10_00.46.52
 # epoch 17: set epsw to 0.001 from 0.01
 # epoch 26: enabled dropout on hs3a,hs3b
 # epoch 27: killed -- overfitting as feared
--- a/layers/layer-params-110.cfg
+++ b/layers/layer-params-110.cfg
@ -0,0 +1,146 @@
 [conv1a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 # this is like #101 but without rnorm. it's about time i found out how helpful it is to modern nets
 # on guppy7
 # logs/layers-110.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-11_00.26.55
 # epoch 19: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 67: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 66: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 75: killed, it looks to be about 1% worse than #101
--- a/layers/layer-params-111.cfg
+++ b/layers/layer-params-111.cfg
@ -0,0 +1,187 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ca]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048cb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs3a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [hs3b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like #101 but uses wcnorm in conv2 and also has 3 fc layers. 
 # it's also like #109 but uses wider fc layers, with dropout in all of them, because #109 overfit
 # on guppy9
 # logs/layers-111.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-12_23.59.48
 # epoch 19: set epsw to 0.001 from 0.01
 # epoch 42: this is quite a bit worse, and in an underfitting way, so i'm starting #104, which will be like this but the fc layers will be 3072 each instead of 2048
--- a/layers/layer-params-112.cfg
+++ b/layers/layer-params-112.cfg
@ -0,0 +1,163 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # on guppy7
 # this is like #101 but with rnorm region of size 5 instead of 9
 # logs/layers-112.log
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 71: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 79: set epsw to 0.00001 from 0.0001
 # epoch 90: killed
 # [1.6064990917001289, 0.37237829837731168, 0.16815557540767209]
--- a/layers/layer-params-113.cfg
+++ b/layers/layer-params-113.cfg
@ -0,0 +1,154 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv3a]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [conv3b]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like #101 but with conv3 taking both conv2 and conv1 as input, and conv2 just taking the low res img as input
 # on guppy9
--- a/layers/layer-params-114.cfg
+++ b/layers/layer-params-114.cfg
@ -0,0 +1,187 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc3072a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072ba]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072bb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072ca]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072cb]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs3a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [hs3b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like #101 but uses wcnorm in conv2 and also has 3 fc layers, with width 6144. 
 # on guppy9
 # logs/layers-114.log
 # 140523240 params (incl biases)
 # /nobackup/kriz/tmp/ConvNet__2012-07-15_14.56.24
 # epoch 20: set epsw to 0.001 from 0.01
 # epoch 40: killed, doing worse than 115 which is the same but has only 2 fc layers
--- a/layers/layer-params-115-jpeg.cfg
+++ b/layers/layer-params-115-jpeg.cfg
@ -0,0 +1,181 @@
 [conv1a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 wcnorm=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc3072a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072ba]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072bb]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072ca]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc3072cb]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # this is like 115 (on gpu) but trained on ilya's new imgnet-2010 jpeg
 # on guppy7
 # logs/layers-115-jpeg.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-18_20.56.13
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 48: set epsw to 0.0001 from 0.001
 # epoch 58: killed, since this was a duplicate (jpeg) of a suboptimal net anyway 
--- a/layers/layer-params-116.cfg
+++ b/layers/layer-params-116.cfg
@ -0,0 +1,303 @@
 [conv1a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2c]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2d]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024a]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024b]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024c]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024d]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [hs1c]
 enable=true
 [hs2c]
 enable=true
 [hs1d]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 # on guppy8
 # this is like 112, but has wcnorm in conv2, and also it's on 4 gpus
--- a/layers/layer-params-117.cfg
+++ b/layers/layer-params-117.cfg
@ -0,0 +1,279 @@
 [conv1a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2c]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2d]
 epsW=0.01,0.01
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [conv3b]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [conv3c]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [conv4a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.01
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1408a]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1408b]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1408c]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1408-2a]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1408-2b]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1408-2c]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [fc1000]
 epsW=0.01,0.01,0.01
 epsB=0.002
 momW=0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005
 wball=0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [hs1c]
 enable=true
 [hs2c]
 enable=true
 [hs1d]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 # on guppy8
 # this is like 112, but has wcnorm in conv2, and also it's on 4 gpus
--- a/layers/layer-params-118.cfg
+++ b/layers/layer-params-118.cfg
@ -0,0 +1,168 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 # on guppy7
 # this is like #112 but with wcnorm on conv2, and also trained on jpeg
 # logs/layers-118.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-19_18.35.31
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 65: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 75: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 84: set epsw to 0.00001 from 0.0001
 # epoch 98: killed
 # [1.640873252105713, 0.37831333333333333, 0.17355999999999999]
--- a/layers/layer-params-120-2012-full.cfg
+++ b/layers/layer-params-120-2012-full.cfg
@ -0,0 +1,174 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #120 (so uses def file #120) but trained on lsvrc-2012 (full)
 # on gpu
 # /storage/tmp/ConvNet__2012-07-26_04.06.44
 # logs/layers-120-2012-full.log
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 38: moved to guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-07-26_04.06.44
 # epoch 49: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 73: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 87: set epsw to 0.00001 from 0.0001
 # epoch 94: killed
 # 
--- a/layers/layer-params-120-2012.cfg
+++ b/layers/layer-params-120-2012.cfg
@ -0,0 +1,173 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #120 (so uses def file #120) but trained on lsvrc-2012 (non-full)
 # on guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-07-24_23.16.15
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 49: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 73: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 81: set epsw to 0.00001 from 0.0001
 # epoch 95: killed
 # validation multiview error:
 # logprob:  1.765247, 0.410440, 0.187140 
--- a/layers/layer-params-120-4gpu-auto2.cfg
+++ b/layers/layer-params-120-4gpu-auto2.cfg
@ -0,0 +1,313 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
--- a/layers/layer-params-120-4gpu-auto3.cfg
+++ b/layers/layer-params-120-4gpu-auto3.cfg
@ -0,0 +1,313 @@
 [conv1a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
--- a/layers/layer-params-120-4gpu-auto4.cfg
+++ b/layers/layer-params-120-4gpu-auto4.cfg
@ -0,0 +1,313 @@
 [conv1a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
--- a/layers/layer-params-120-4gpu-auto5.cfg
+++ b/layers/layer-params-120-4gpu-auto5.cfg
@ -0,0 +1,313 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
--- a/layers/layer-params-120-4gpu-auto6.cfg
+++ b/layers/layer-params-120-4gpu-auto6.cfg
@ -0,0 +1,313 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.00001,0.00001,0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
--- a/layers/layer-params-120-4gpu.cfg
+++ b/layers/layer-params-120-4gpu.cfg
@ -0,0 +1,314 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.01,0.01,0.01,0.01
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #120 but on 4 gpus. trained on 2012 (non-full)
 # on guppy
 # logs/layers-120-4gpu.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-03_14.28.23
 # epoch 56: killed, this is overfitting. will try reducing the number of params.
--- a/layers/layer-params-120.cfg
+++ b/layers/layer-params-120.cfg
@ -0,0 +1,174 @@
 [conv1a]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #106 but with rnorm of size 5, also train on jpegs
 # on gpu
 # logs/layers-120.log
 # /storage/tmp/ConvNet__2012-07-22_04.40.34
 # moving to guppy7
 # /nobackup/kriz/tmp/ConvNet__2012-07-22_04.40.34/
 # epoch 26: set epsw to 0.001 from 0.01
 # epoch 47: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 72: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 82: set epsw to 0.00001 from 0.0001
 # epoch 106: killed
 # logprob:  1.634692, 0.378533, 0.172360
--- a/layers/layer-params-121.cfg
+++ b/layers/layer-params-121.cfg
@ -0,0 +1,179 @@
 [conv1a]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm1a]
 scale=0.001
 pow=0.75
 [cnorm1b]
 scale=0.001
 pow=0.75
 # this is like #120 but with cnorm over conv1 as well
 # on guppy8
 # logs/layers-121.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-22_15.59.00
 # epoch 25: set epsw to 0.001 from 0.01
 # epoch 51: set epsw to 0.0001 from 0.001
 # epoch 63: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 76: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 90: set epsw to 0.00001 from 0.0001
 # worse than 120
--- a/layers/layer-params-126-2012-full.cfg
+++ b/layers/layer-params-126-2012-full.cfg
@ -0,0 +1,165 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #120-2012-full  but also has horiz reflection for gpu2
 # on guppy8
 # logs/layers-126.log
 # /nobackup/kriz/tmp/ConvNet__2012-07-31_22.55.59
 # killed after 19 epochs. seems no good, and we now also know that training on full works poorly
--- a/layers/layer-params-127.cfg
+++ b/layers/layer-params-127.cfg
@ -0,0 +1,174 @@
 [conv1a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv3b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # trained on lsvrc-2012 (full), like #120 but
 # this examines whether communication is necessary at conv3
 # .. meaning it has no communication at conv3
 # on gpu
 # /storage/tmp/ConvNet__2012-08-01_02.35.01
 # logs/layers-127.log
 # killed, since we know now that full sucks.
 # trained on lsvrc-2012 (non-full). like #120 but now also make conv3,conv4 wider to compensate for lost connections
 # on guppy8
 # logs/layers-127a.log
 # /ais/gobi3/u/kriz/tmp/ConvNet__2012-08-02_00.18.36
 # epoch 21: set epsw to 0.001 from 0.01
 # epoch 36: killed, significantly worse than 120
--- a/layers/layer-params-128.cfg
+++ b/layers/layer-params-128.cfg
@ -0,0 +1,167 @@
 [conv1a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv5a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #120, but has narrower columns which communicate more.
 # i'm running this because #127 suggests
 # that communication is good
 # on guppy9
 # logs/layers-128.log
 # epoch 25: set epsw to 0.001 from 0.01
 # on hold
--- a/layers/layer-params-129.cfg
+++ b/layers/layer-params-129.cfg
@ -0,0 +1,316 @@
 [conv1a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.001,0.001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.001,0.001,0.001,0.001
 epsB=0.002
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # 4 gpus, based on 120
 # on guppy7
 # logs/layers-129.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-06_22.23.16
 # epoch 22: set epsw to 0.001 from 0.01
 # uhh.. relu wiped this. nice.
--- a/layers/layer-params-130.cfg
+++ b/layers/layer-params-130.cfg
@ -0,0 +1,320 @@
 [conv1a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1c]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1d]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2c]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2d]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3c]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3d]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4c]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4d]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5c]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5d]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc1024-1a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-1d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2a]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2b]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2c]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1024-2d]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [fc1000]
 epsW=0.0001,0.0001,0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9,0.9,0.9
 momB=0.9
 wc=0.0005,0.0005,0.0005,0.0005
 wball=0,0,0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs1b]
 enable=true
 [hs1c]
 enable=true
 [hs1d]
 enable=true
 [hs2a]
 enable=true
 [hs2b]
 enable=true
 [hs2c]
 enable=true
 [hs2d]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm1c]
 scale=0.0001
 pow=0.75
 [rnorm1d]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [rnorm2c]
 scale=0.0001
 pow=0.75
 [rnorm2d]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 [cnorm2c]
 scale=0.001
 pow=0.75
 [cnorm2d]
 scale=0.001
 pow=0.75
 # this is like #129, but with 2x as many filters in conv2
 # on guppy8
 # /nobackup/kriz/tmp/ConvNet__2012-08-07_13.31.34
 # logs/layers-130.log
 # uhh.. relu wiped this. nice.
 # on guppy9
 # logs/layers-130a.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-09_14.09.20
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 62: killed. surprisingly, this is hardly (if at all) better than 2-gpu net
--- a/layers/layer-params-131-2009.cfg
+++ b/layers/layer-params-131-2009.cfg
@ -0,0 +1,172 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # on guppy9
 # logs/layers-131-2009.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-18_15.41.20
 # epoch 7: set epsw to 0.001 from 0.01
 # epoch 14: set epsw to 0.0001 from 0.001
 # epoch 20: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 24: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 31: set epsw to 0.00001 from 0.0001
 # epoch 36: killed
 # logprob:  3.466260, 0.694209, 0.437308
 # a bit worse than the previous 2009 run!
--- a/layers/layer-params-131.cfg
+++ b/layers/layer-params-131.cfg
@ -0,0 +1,175 @@
 [conv1a]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.0
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.0
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #120, but puts rnorm1 right over conv1 (trained on 2012-nonfull)
 # on gpu
 # /storage/tmp/ConvNet__2012-08-09_12.33.33
 # logs/layers-131.log
 # moved to guppy7
 # /nobackup/kriz/tmp/ConvNet__2012-08-09_12.33.33/
 # epoch 22: set epsw to 0.001 from 0.01
 # epoch 46: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 75: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 81: set epsw to 0.00001 from 0.0001
 # epoch 100: killed
 # validation multiview error:
 # logprob:  1.755725, 0.409340, 0.185740 
--- a/layers/layer-params-132.cfg
+++ b/layers/layer-params-132.cfg
@ -0,0 +1,179 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv3b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv5a]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like 120 but with communication in conv4 instead of conv3
 # on gpu
 # logs/layers-132.log
 # /storage/tmp/ConvNet__2012-08-11_02.23.36
 # epoch 20: set epsw to 0.001 from 0.01
 # epoch 44: set epsw to 0.0001 from 0.001
 # moved to guppy9
 # killed: i accidentally trained this on full
 # restart:
 # /nobackup/kriz/tmp/ConvNet__2012-08-13_16.47.07
 # logs/layers-132a.log
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 4x (exact epoch not recorded): set epsw to 0.0001 from 0.001
 # epoch 65: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 71: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 88: killed, worse than 131
--- a/layers/layer-params-133.cfg
+++ b/layers/layer-params-133.cfg
@ -0,0 +1,167 @@
 [conv1a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv3b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv5a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 [rnorm1b]
 scale=0.0001
 pow=0.75
 [rnorm2a]
 scale=0.0001
 pow=0.75
 [rnorm2b]
 scale=0.0001
 pow=0.75
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is a hybrid of 131 and 132: so it's like 120, but has communication in conv4 instead of conv3, and it also puts rnorm1 directly over conv1
 # on guppy7
 # logs/layers-133.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-15_16.08.23
 # epoch 21: set epsw to 0.001 from 0.01
 # epoch 48: set epsw to 0.0001 from 0.001
 # epoch 50: killed, worse than 131
--- a/layers/layer-params-134.cfg
+++ b/layers/layer-params-134.cfg
@ -0,0 +1,169 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=0.25
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #131, but with minDiv of 0.25 on rnorms
 # on guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-08-20_23.26.41
 # logs/layers-134.log
 # epoch 13: on hold
--- a/layers/layer-params-135-2009-2012.cfg
+++ b/layers/layer-params-135-2009-2012.cfg
@ -0,0 +1,199 @@
 [conv1a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this trains 135 on 2012, initialized from 2009 1-8800
 # on guppy9
 # init epsw 0.001
 # logs/layers-135-2012-pretrain-2009.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-09_15.20.47
 # epoch 22: set epsw to 0.0001 from 0.001
 # epoch 23: putting on hold to train softmax tree
 #           this is doing worse than 141-2009 anyway, which has an extra 6th conv layer (1.97 vs 2.00)
 # 135 notes:
 # this is like #131, but with minDiv of 2 on rnorms
 # on guppy8
 # /nobackup/kriz/tmp/ConvNet__2012-08-21_01.49.23
 # logs/layers-135.log
 # epoch 20: set epsw to 0.001 from 0.01
 # epoch 47: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 75: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 81: set epsw to 0.00001 from 0.0001
 # epoch 96: killed
 # validation multiview: 
 # logprob:  1.757653, 0.410700, 0.184160 
 # now let's train on 2009 1-8800
 # logs/layers-135-2009-bigtrain.log
 # on guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-08-26_22.39.45
 # epoch 4.7822: set epsw to 0.001 from 0.01
 # epoch 8.1299: set epsw to 0.0001 from 0.001
 # epoch 10.3697: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 11.4731: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 14.3906: set epsw to 0.00001 from 0.0001
 # epoch 17: killed
--- a/layers/layer-params-135-2009.cfg
+++ b/layers/layer-params-135-2009.cfg
@ -0,0 +1,189 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #131, but with minDiv of 2 on rnorms
 # on guppy8
 # /nobackup/kriz/tmp/ConvNet__2012-08-21_01.49.23
 # logs/layers-135.log
 # epoch 20: set epsw to 0.001 from 0.01
 # epoch 47: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 75: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 81: set epsw to 0.00001 from 0.0001
 # epoch 96: killed
 # validation multiview: 
 # logprob:  1.757653, 0.410700, 0.184160 
 # now let's train on 2009 1-8800
 # logs/layers-135-2009-bigtrain.log
 # on guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-08-26_22.39.45
 # epoch 4.7822: set epsw to 0.001 from 0.01
 # epoch 8.1299: set epsw to 0.0001 from 0.001
 # epoch 10.3697: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 11.4731: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 14.3906: set epsw to 0.00001 from 0.0001
 # epoch 17: killed
--- a/layers/layer-params-135.cfg
+++ b/layers/layer-params-135.cfg
@ -0,0 +1,177 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #131, but with minDiv of 2 on rnorms
 # on guppy8
 # /nobackup/kriz/tmp/ConvNet__2012-08-21_01.49.23
 # logs/layers-135.log
 # epoch 20: set epsw to 0.001 from 0.01
 # epoch 47: set epsw to 0.0001 from 0.001
 # epoch 66: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 75: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 81: set epsw to 0.00001 from 0.0001
 # epoch 96: killed
 # validation multiview: 
 # logprob:  1.757653, 0.410700, 0.184160 
--- a/layers/layer-params-136.cfg
+++ b/layers/layer-params-136.cfg
@ -0,0 +1,169 @@
 [conv1a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.01
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.01,0.01
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #135 (so uses def file 135), but subtracts scalar mean
 # on guppy7
 # logs/layers-136.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-23_04.38.51
 # epoch 15: eh, this is no better, and has no reason to be better. screw it.
--- a/layers/layer-params-137-tree.cfg
+++ b/layers/layer-params-137-tree.cfg
@ -0,0 +1,196 @@
 [conv1a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.001,0.001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like 137, but with treefc
 # on guppy9
 # init epsw 0.01 -- this run does not scale epsw by node size
 # /nobackup/kriz/tmp/ConvNet__2012-09-10_22.47.57
 # logs/layers-137-tree.log
 # epoch 14: set epsw to 0.001 from 0.01
 # epoch 38: killed... it's stuck at 2.17 nats; should be nearer to 2.06. perhaps resume later
 # 137 notes:
 # this is like #135, but changes the cnorm layers to rnorm
 # on guppy8
 # logs/layers-137.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04
 # epoch 26: set epsw to 0.001 from 0.01
 # epoch 50: set epsw to 0.0001 from 0.001
 # epoch 75: set epsw to 0 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 84: set epsw to 0.00001 from 0.0001
 # epoch 92: made backup to /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04.bak
 #           set epsw to 0.0001 from 0.00001 (conv1/2 still 0)
 #           using BRIGHTNESS NOISE of 0.2 (in other words i zeroed out the other components of the color noise)
 # epoch 101: set color (brightness) noise to 0 from 0.2
 # epoch 105: set epsw to 0.00001 from 0.0001
 # experiment a failure. going back to training /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04.bak
 # epoch 99: killed
 # logprob:  1.751138, 0.407820, 0.183440 
 # batch size 128 x 8:
 # /nobackup/kriz/tmp/ConvNet__2012-09-07_17.08.47
 # epoch 25: set epsw to 0.001 from 0.01
 #           made backup to /nobackup/kriz/tmp/ConvNet__2012-09-07_17.08.47.bak
 # epoch 34: killed, it's not good
--- a/layers/layer-params-137.cfg
+++ b/layers/layer-params-137.cfg
@ -0,0 +1,207 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #135, but changes the cnorm layers to rnorm
 # on lsvrc-2010:
 # logs/layers-137-2010.log
 # on guppy9
 # /nobackup/kriz/tmp/ConvNet__2012-09-13_02.47.12
 # epoch 25: set epsw to 0.001 from 0.01
 # epoch 49: set epsw to 0.0001 from 0.001
 # epoch 81: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 85: set epsw to 0 from 0.00001 on conv1,conv2
 #           set epsw to 0.00001 from 0.0001 elsewhere 
 # epoch 103: killed
 # validation:
 # logprob:  1.727592, 0.394153, 0.182784 
 # validation multiview:
 # logprob:  1.632875, 0.377960, 0.171020 
 # test multiview:
 # logprob:  1.623185, 0.376167, 0.171247 
 # on lsvrc-2012:
 # on guppy8
 # logs/layers-137.log
 # /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04
 # epoch 26: set epsw to 0.001 from 0.01
 # epoch 50: set epsw to 0.0001 from 0.001
 # epoch 75: set epsw to 0 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 84: set epsw to 0.00001 from 0.0001
 # epoch 92: made backup to /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04.bak
 #           set epsw to 0.0001 from 0.00001 (conv1/2 still 0)
 #           using BRIGHTNESS NOISE of 0.2 (in other words i zeroed out the other components of the color noise)
 # epoch 101: set color (brightness) noise to 0 from 0.2
 # epoch 105: set epsw to 0.00001 from 0.0001
 # experiment a failure. going back to training /nobackup/kriz/tmp/ConvNet__2012-08-25_05.39.04.bak
 # epoch 99: killed
 # logprob:  1.751138, 0.407820, 0.183440 
 # batch size 128 x 8:
 # /nobackup/kriz/tmp/ConvNet__2012-09-07_17.08.47
 # epoch 25: set epsw to 0.001 from 0.01
 #           made backup to /nobackup/kriz/tmp/ConvNet__2012-09-07_17.08.47.bak
 # epoch 34: killed, it's not good
--- a/layers/layer-params-139.cfg
+++ b/layers/layer-params-139.cfg
@ -0,0 +1,172 @@
 [conv1a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 wcNormMin=0.001,0
 wcNormMax=0.002,0
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #137 (hence uses same file) but has wcnorm on conv2[0]
 # epoch 19: set epsw to 0.001 from 0.01
 # epoch 49: set epsw to 0.0001 from 0.001
 # epoch 62: killed, about 0.01 nat worse than 137 (which is pretty significant at this stage)
--- a/layers/layer-params-141-2009-half.cfg
+++ b/layers/layer-params-141-2009-half.cfg
@ -0,0 +1,203 @@
 [conv1a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv6a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv6b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like 141, but trained on half of 2009 imgnet, to be comparable to google's results
 # logs/layers-141-2009-half.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-09_00.26.31
 # on guppy9
 # epoch 6.2600: set epsw to 0.001 from 0.01
 # epoch 13.3361: set epsw to 0.0001 from 0.001
 # epoch 18.2396: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #                set color noise to 0 from 0.1
 # epoch 21.1949: set epsw to 0 from 0.00001 on conv1,conv2
 # epoch 25.3718: set epsw to 0.00001 from 0.0001
 # epoch 28.3271: killed
 # ok, test error rate is a bit worse than 131, restarting with epsw 0.001, color noise 0.1
 # epoch 44.183: set epsw to 0.0001 from 0.001
 # epoch 56: eek, it started getting worse on validation :/
 # 141 notes:
 # this is like #137 but with conv6, also communication in conv6
 # /nobackup/kriz/tmp/ConvNet__2012-09-03_16.27.48
 # logs/layers-141.log
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 48: set epsw to 0.0001 from 0.001
 # epoch 60: this seems overfitty....killing
 # but will use these weights to initialize a net on 2009... why the hell not?
--- a/layers/layer-params-141-2009.cfg
+++ b/layers/layer-params-141-2009.cfg
@ -0,0 +1,231 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0000,0.0000
 epsB=0.00
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv6a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv6b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # training on lsvrc-2010
 # initialized from 141 trained on lsvrc-2012, then 2009
 # using def file layers-141-2009-2010.cfg
 # /nobackup/kriz/tmp/ConvNet__2012-09-12_01.06.32
 # on guppy8
 # init epsw 0.001
 # logs/layers-141-2010-pretrain-2009-pretrain-2012.log
 # epoch 14: set epsw to 0.0001 from 0.001
 # epoch 30: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 36: set epsw to 0 on conv1/2
 # epoch 47: set epsw to 0.00001 from 0.0001
 # epoch 54: killed
 # logprob:  1.511725, 0.356707, 0.154893 
 # training on lsvrc-2012
 # initialized from 141 trained on lsvrc-2012, then 2009
 # using def file layers-141-2009-2012.cfg
 # init epsw 0.001
 # logs/layers-141-2012-pretrain-2009-pretrain-2012.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-09_03.36.13
 # backup: /ais/gobi3/u/kriz/tmp/ConvNet__2012-09-09_03.36.13
 #         also /ais/gobi3/u/kriz/net-backups/
 # on guppy8
 # epoch 13: set epsw to 0.0001 from 0.001
 # epoch 26: set epsw to 0.00001 from 0.0001 on conv1,conv2
 #           set color noise to 0 from 0.1
 # epoch 32: set epsw to 0 on conv1/2
 # epoch 43: set epsw to 0.00001 from 0.0001
 # epoch 54: killed
 # python convnet.py -f /nobackup/kriz/tmp/ConvNet__2012-09-09_03.36.13 --test-only=1 --test-one=0 --multiview-test=1
 # logprob:  1.671316, 0.395620, 0.172060 
 #python convnet.py -f /nobackup/kriz/tmp/ConvNet__2012-09-09_03.36.13 --test-only=1 --test-one=0 --multiview-test=0
 # logprob:  1.779082, 0.415920, 0.186780 
 # 141-2009 notes, before going back to 2012:
 # initialized from 141 trained on lsvrc-2012
 # init epsw 0.001
 # logs/layers-141-2009-pretrain-2012.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-07_05.22.51
 # epoch 4.1189: set epsw to 0.0001 from 0.001
 # epoch 5.1596: killed, not improving much. let's go back to training on lsvrc-2012 with these weights now.
 # 
 # 141 notes:
 # this is like #137 but with conv6, also communication in conv6
 # /nobackup/kriz/tmp/ConvNet__2012-09-03_16.27.48
 # logs/layers-141.log
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 48: set epsw to 0.0001 from 0.001
 # epoch 60: this seems overfitty....killing
 # but will use these weights to initialize a net on 2009... why the hell not?
--- a/layers/layer-params-141.cfg
+++ b/layers/layer-params-141.cfg
@ -0,0 +1,187 @@
 [conv1a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv2b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0.00,0.00
 [conv3a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.0001
 epsB=0.02
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv6a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv6b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048a]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.0001,0.0001
 epsB=0.02
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #137 but with conv6, also communication in conv6
 # /nobackup/kriz/tmp/ConvNet__2012-09-03_16.27.48
 # logs/layers-141.log
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 48: set epsw to 0.0001 from 0.001
 # epoch 60: this seems overfitty....killing
 # but will use these weights to initialize a net on 2009... why the hell not?
--- a/layers/layer-params-145-2010.cfg
+++ b/layers/layer-params-145-2010.cfg
@ -0,0 +1,206 @@
 [conv1a]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv1b]
 epsW=0.0000
 epsB=0.00
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv2b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0.00
 [conv3a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv3b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [conv4a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv4b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5a]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [conv5b]
 epsW=0.00001
 epsB=0.002
 momW=0.9
 momB=0.9
 wc=0.0005
 wball=0
 [fc2048a]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048b]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048ba]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc2048bb]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [fc1000]
 epsW=0.00001,0.00001
 epsB=0.002
 momW=0.9,0.9
 momB=0.9
 wc=0.0005,0.0005
 wball=0,0
 [logprob]
 coeff=1
 topk=5
 [hs1a]
 enable=true
 [hs2a]
 enable=true
 [hs1b]
 enable=true
 [hs2b]
 enable=true
 [rnorm1a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm1b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2a]
 scale=0.0001
 pow=0.75
 minDiv=2
 [rnorm2b]
 scale=0.0001
 pow=0.75
 minDiv=2
 [cnorm2a]
 scale=0.001
 pow=0.75
 [cnorm2b]
 scale=0.001
 pow=0.75
 # this is like #137 but without low-res stuff
 # on lsvrc-2010:
 # guppy9
 # logs/layers-145-2010.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-27_12.39.44
 # epoch 23: set epsw to 0.001 from 0.01
 # epoch 51: set epsw to 0.0001 from 0.001
 # epoch 68: set epsw to 0.00001 from 0.0001 on conv1
 #           set color noise to 0 from 0.1
 # epoch 72: set epsw to 0 on conv1
 # epoch 78: set epsw to 0.00001 from 0.0001
 # epoch 93: killed
 # test multiview:
 # logprob:  1.614660, 0.374727, 0.169987 
 # test center patch:
 # logprob:  1.706031, 0.390247, 0.182953 (NOTE, NOT MULTIVIEW!!)
 # on gpu (now guppy8)
 # logs/layers-145.log
 # /storage/tmp/ConvNet__2012-09-13_03.43.56
 # epoch 25: set epsw to 0.001 from 0.01
 # epoch 36: paused for localization experiments
 # resuming on guppy9
 # logs/layers-145-cont.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-13_03.43.56
 # epoch 51: set epsw to 0.0001 from 0.001
 # epoch 58: paused for imgnet-20k experiments
 # moved to guppy8
 # epoch 67: set epsw to 0.00001 from 0.0001 on conv1
 #           set color noise to 0 from 0.1
 # epoch 72: set epsw to 0 on conv1
 # epoch 79: set epsw to 0.00001 from 0.0001
 # epoch 91: killed
 # logprob:  1.741473, 0.406640, 0.182100 
 # on 2012-full:
 # on guppy7
 # logs/layers-145-full.log
 # /nobackup/kriz/tmp/ConvNet__2012-09-23_19.38.45
 # epoch 19: set epsw to 0.001 from 0.01
 # epoch 47: set epsw to 0.0001 from 0.001
 # epoch 61: moved to gpu
--- a/Show more
+++ b/Show more
		`@ -0,0 +1 @@`
							`External contributions are not accepted, sorry!`
		`@ -0,0 +1,2 @@`
							`deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 4, Device = Tesla S2050, Device = Tesla S2050`
							`deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 4.2, CUDA Runtime Version = 4.2, NumDevs = 4, Device = Tesla S2050, Device = Tesla S2050`