Initial commit

This commit is contained in:
Laurent El Shafey 2024-12-10 08:56:11 -08:00
commit 9fdd561586
246 changed files with 58283 additions and 0 deletions

594
src/convnet.cu Normal file

@@ -0,0 +1,594 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <vector>
#include <iostream>
#include <string>
#include <set>
#include <map>
#include <nvmatrix.cuh>
#include <nvmatrix_operators.cuh>
#include <matrix.h>
#include <convnet.cuh>
#include <util.cuh>
using namespace std;
/*
* =======================
* ConvNet
* =======================
*/
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, vector<intv*>& deviceCPUs, int minibatchSize, int weightUpdateFreq) : Thread(false) {
_weightUpdateFreq = weightUpdateFreq;
_numBwdMiniPasses = 0;
_deviceIDs = &deviceIDs;
_deviceCPUs = &deviceCPUs;
_data = NULL;
_trainingProgress = 0;
_sync = new ThreadSynchronizer(deviceIDs.size() + 1);
seti pipeSet;
pipeSet.insert(deviceIDs.begin(), deviceIDs.end());
_pd = new PipeDispenserNonBlocking(pipeSet);
PyObject* layerList = PyDict_Values(layerParams);
// Data layers live on the manager thread (in CPU memory)
for (int i = 0; i < PyList_GET_SIZE(layerList); i++) {
PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
string layerType = pyDictGetString(paramsDict, "type");
if (layerType == "data") {
DataLayer* d = new DataLayer(NULL, paramsDict);
_dataLayers.push_back(d);
_layerMap[d->getName()] = d;
}
}
// Initialize GPU worker threads
for (int d = 0; d < deviceIDs.size(); ++d) {
ConvNetGPU* cng = new ConvNetGPU(layerList, deviceIDs[d], *deviceCPUs[d], this);
_convNetThreads.push_back(cng);
for (map<string, Layer*>::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
_layerMap[it->first] = it->second;
}
}
// Connect forward/backward links in graph
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
if (inputList != NULL) {
for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
string inputName = PyString_AsString(PyList_GetItem(inputList, i));
it->second->addPrev(_layerMap[inputName]);
_layerMap[inputName]->addNext(it->second);
}
}
}
_numFwdTerminal = 0;
// Execute post-initialization stuff
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->postInit();
_numFwdTerminal += it->second->getNext().size() == 0; // Number of terminal nodes going forward
}
// Find and count the terminal nodes in the backward pass
set<string> visited, terminal;
for (int t = 0; t < _convNetThreads.size(); t++) {
vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
for (int c = 0; c < cl.size(); c++) {
findBwdTerminal(*cl[c], visited, terminal);
}
}
_numBwdTerminal = terminal.size();
// printf("num fwd terminals: %d, back terminals:\n", _numFwdTerminal);
// for (set<string>::iterator it = terminal.begin(); it != terminal.end(); ++it) {
// printf("%s\n", (*it).c_str());
// }
_dp = new DataProvider(minibatchSize);
Py_DECREF(layerList);
assert(_weightUpdateFreq > 0);
}
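// Recursively walks the graph backward from a cost layer and records the "terminal" layers of
// the backward pass: gradient consumers that either do not produce a gradient themselves or
// none of whose inputs consume gradients. The number of such terminals is how many
// BPROP_TERMINAL messages bprop() will wait for.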
void ConvNet::findBwdTerminal(Layer& l, set<string>& visited, set<string> &terminal) {
if (visited.count(l.getName()) == 0) {
visited.insert(l.getName());
if (l.isGradConsumer()) {
bool hasPrevConsumer = false;
for (int i = 0; i < l.getPrev().size(); i++) {
hasPrevConsumer |= l.getPrev()[i]->isGradConsumer();
}
if (!hasPrevConsumer || !l.isGradProducer()) {
terminal.insert(l.getName());
l.setBwdTerminal(true);
} else if (l.isGradProducer()) {
for (int i = 0; i < l.getPrev().size(); i++) {
if (l.getPrev()[i]->isGradConsumer()) {
findBwdTerminal(*l.getPrev()[i], visited, terminal);
}
}
}
}
}
}
void* ConvNet::run() {
// The manager thread defaults to using the GPU of the first worker.
// Put more logic here if this is inappropriate.
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
for (int t = 0; t < _convNetThreads.size(); t++) {
_convNetThreads[t]->start();
}
copyToGPU();
while (true) {
Worker* worker = _workerQueue.dequeue();
worker->run();
delete worker;
}
return NULL;
}
Queue<Worker*>& ConvNet::getWorkerQueue() {
return _workerQueue;
}
Queue<WorkResult*>& ConvNet::getResultQueue() {
return _resultQueue;
}
DataProvider& ConvNet::getDataProvider() {
return *_dp;
}
Layer& ConvNet::operator[](string& name) {
return *_layerMap[name];
}
Layer& ConvNet::getLayer(string& name) {
return *_layerMap[name];
}
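// Broadcasts a message to every GPU worker thread. With sync == true, a SYNC message is also
// enqueued for each worker and the caller blocks on the shared ThreadSynchronizer until every
// worker has processed its SYNC (see the SYNC case in ConvNetGPU::run).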
void ConvNet::sendMessage(MESSAGES msg, bool sync) {
for (int i = 0; i < _convNetThreads.size(); i++) {
_convNetThreads[i]->enqueueMessage(new Message(msg));
if (sync) {
_convNetThreads[i]->enqueueMessage(new Message(SYNC));
}
}
if (sync) {
_sync->sync();
}
}
void ConvNet::copyToCPU() {
sendMessage(COPY_TO_CPU, true);
}
void ConvNet::copyToGPU() {
sendMessage(COPY_TO_GPU, false);
}
void ConvNet::updateWeights() {
sendMessage(UPDATE_WEIGHTS, true);
}
void ConvNet::reset() {
sendMessage(RESET, false);
}
void ConvNet::fprop(PASS_TYPE passType) {
assert(_data != NULL);
reset();
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFprop(*_data, passType);
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
void ConvNet::fprop(CPUData& data, PASS_TYPE passType) {
if (&data != _data) {
delete _data;
}
_data = &data;
fprop(passType);
}
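// Forward pass over minibatch miniIdx. If this minibatch was already handed to the data layers
// as a buffer by the previous call, fprop restarts from that buffer; otherwise the minibatch is
// fetched from the DataProvider. The next minibatch (if any) is then installed as the buffer,
// presumably so that its transfer can overlap with computation on the current one.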
void ConvNet::fprop(int miniIdx, PASS_TYPE passType) {
delete _data;
reset();
if (miniIdx == 0 || miniIdx != _dataLayers[0]->getBufferMinibatchIdx()) {
_data = &_dp->getMinibatch(miniIdx);
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFprop(*_data, passType);
}
} else {
_data = _dataLayers[0]->getBufferData();
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFpropFromBuffer(passType);
}
}
CPUData* nextData = miniIdx + 1 == _dp->getNumMinibatches() ? NULL : &_dp->getMinibatch(miniIdx + 1);
if (nextData != NULL) {
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->setBuffer(*nextData, miniIdx + 1);
}
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
void ConvNet::bprop(PASS_TYPE passType) {
// Weights are updated when this is zero
_numBwdMiniPasses = (_numBwdMiniPasses + 1) % _weightUpdateFreq;
for (int i = 0; i < _convNetThreads.size(); i++) {
_convNetThreads[i]->enqueueMessage(new BpropStartMessage(passType));
}
waitForTerminals(_numBwdTerminal, BPROP_TERMINAL);
reset();
}
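// Blocks until numMsgs messages of the given terminal type (FPROP_TERMINAL or BPROP_TERMINAL)
// have arrived on the manager's message queue, one per terminal layer.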
void ConvNet::waitForTerminals(int numMsgs, MESSAGES msg) {
int terminalsDone = 0;
while(terminalsDone++ < numMsgs) {
Message* m = _msgQueue.dequeue();
assert(m->getMessageType() == msg);
delete m;
}
}
// Same as getCost() but adds results to given cost and returns it
Cost& ConvNet::getCost(Cost& cost) {
Cost &tmp = getCost();
cost += tmp;
delete &tmp;
return cost;
}
Cost& ConvNet::getCost() {
Cost &tmp = *new Cost(_data->getNumCases());
for (int i = 0; i < _convNetThreads.size(); i++) {
Cost& tmp2 = _convNetThreads[i]->getCost(_data->getNumCases());
tmp |= tmp2;
delete &tmp2;
}
return tmp;
}
double ConvNet::getCostValue() {
Cost& cost = getCost();
double val = cost.getValue();
delete &cost;
return val;
}
Queue<Message*>& ConvNet::getMessageQueue() {
return _msgQueue;
}
int ConvNet::getDeviceID(int gpuIdx) {
if (gpuIdx < 0) {
return -1;
}
return _deviceIDs->at(gpuIdx);
}
intv& ConvNet::getDeviceIDs() {
return *_deviceIDs;
}
ThreadSynchronizer& ConvNet::getSync() {
return *_sync;
}
PipeDispenser& ConvNet::getPipeDispenser() {
return *_pd;
}
void ConvNet::syncWithChildren() {
sendMessage(SYNC, false);
_sync->sync();
}
int ConvNet::getWeightUpdateFreq() {
return _weightUpdateFreq;
}
int ConvNet::getNumBwdMiniPasses() {
return _numBwdMiniPasses;
}
int ConvNet::getMinibatchSize() {
return _dp->getMinibatchSize();
}
void ConvNet::setTrainingProgress(double progress) {
_trainingProgress = progress;
}
double ConvNet::getTrainingProgress() const {
return _trainingProgress;
}
/*
* Gradient checking stuff
*/
void ConvNet::checkGradients() {
_numFailures = 0;
_numTests = 0;
fprop(0, PASS_GC);
_baseErr = getCostValue();
bprop(PASS_GC);
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
if (it->second->getDeviceID() >= 0) {
NVMatrix::setDeviceID(it->second->getDeviceID());
it->second->checkGradients();
}
}
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
cout << "------------------------" << endl;
if (_numFailures > 0) {
cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
} else {
cout << "ALL " << _numTests << " TESTS PASSED" << endl;
}
}
/*
* name: weight matrix name
* eps: finite difference step
*
* Perturbs each weight in turn by eps, re-runs fprop, and forms the numerical gradient
* (E(w_ij + eps) - E(w)) / (numCases * eps). The check fails if the relative error
* ||numGrad - analyticGrad|| / ||analyticGrad|| is at least GC_REL_ERR_THRESH.
*/
bool ConvNet::checkGradient(const string& name, float eps, Weights& weights) {
Matrix numGrad(weights.getNumRows(), weights.getNumCols());
Matrix diff(numGrad);
numGrad.apply(Matrix::ZERO);
Matrix weightsCPU;
weights.getW().copyToHost(weightsCPU, true);
for(int i = 0; i < weights.getNumRows(); i++) {
for (int j = 0; j < weights.getNumCols(); j++) {
float v = weightsCPU(i,j);
weightsCPU(i,j) += eps;
weights.getW().copyFromHost(weightsCPU);
weightsCPU(i,j) = v;
fprop(PASS_GC);
double err = getCostValue();
numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
if (isnan(numGrad(i,j)) || isinf(numGrad(i,j))) {
cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
cout << "Exiting." << endl;
exit(1);
}
weights.getW().copyFromHost(weightsCPU);
}
}
Matrix gradCPU;
weights.getGrad().copyToHost(gradCPU, true);
gradCPU.scale(-1.0 / _data->getNumCases());
float analNorm = gradCPU.norm();
float numNorm = numGrad.norm();
numGrad.subtract(gradCPU, diff);
float relErr = diff.norm() / analNorm;
bool fail = relErr >= GC_REL_ERR_THRESH;
if (fail || !GC_SUPPRESS_PASSES) {
cout << "========================" << endl;
printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
cout << "========================" << endl;
cout << "Analytic:" << endl;
gradCPU.print(0, 6, 0, 4);
cout << "Numeric:" << endl;
numGrad.print(0, 6, 0, 4);
printf("Analytic norm: %e\n", analNorm);
printf("Numeric norm: %e\n", numNorm);
printf("Relative error: %e\n", relErr);
}
_numTests++;
_numFailures += fail;
return fail;
}
/*
* =======================
* ConvNetGPU
* =======================
*/
ConvNetGPU::ConvNetGPU(PyObject* layerList, int deviceID, intv& deviceCPUs, ConvNet* convNet)
: Thread(false, deviceCPUs), _deviceID(deviceID), _convNet(convNet) {
try {
int numLayers = PyList_GET_SIZE(layerList);
for (int i = 0; i < numLayers; i++) {
PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
int layerDeviceID = convNet->getDeviceID(pyDictGetInt(paramsDict, "gpu"));
if (layerDeviceID == _deviceID) {
initLayer(paramsDict);
}
}
} catch (string& s) {
cout << "Error creating ConvNet: " << s << endl;
exit(1);
}
}
void ConvNetGPU::initLayer(PyObject* paramsDict) {
string type = pyDictGetString(paramsDict, "type");
string name = pyDictGetString(paramsDict, "name");
if (type == "fc") {
_layerMap[name] = new FCLayer(this, paramsDict, false, true);
} else if (type == "treefc") {
_layerMap[name] = new TreeFCLayer(this, paramsDict);
} else if (type == "conv") {
_layerMap[name] = new ConvLayer(this, paramsDict);
} else if (type == "local") {
_layerMap[name] = new LocalUnsharedLayer(this, paramsDict);
} else if (type == "pool") {
_layerMap[name] = &PoolLayer::makePoolLayer(this, paramsDict);
} else if (type == "rnorm") {
_layerMap[name] = new ResponseNormLayer(this, paramsDict);
} else if (type == "cmrnorm") {
_layerMap[name] = new CrossMapResponseNormLayer(this, paramsDict);
} else if (type == "cnorm") {
_layerMap[name] = new ContrastNormLayer(this, paramsDict);
} else if (type == "softmax") {
_layerMap[name] = new SoftmaxLayer(this, paramsDict);
} else if (type == "eltsum") {
_layerMap[name] = new EltwiseSumLayer(this, paramsDict);
} else if (type == "eltmax") {
_layerMap[name] = new EltwiseMaxLayer(this, paramsDict);
} else if (type == "neuron") {
_layerMap[name] = new NeuronLayer(this, paramsDict);
} else if (type == "nailbed") {
_layerMap[name] = new NailbedLayer(this, paramsDict);
} else if (type == "blur") {
_layerMap[name] = new GaussianBlurLayer(this, paramsDict);
} else if (type == "href") {
_layerMap[name] = new HorizontalReflectionLayer(this, paramsDict);
} else if (type == "resize") {
_layerMap[name] = new ResizeLayer(this, paramsDict);
} else if (type == "rgb2yuv") {
_layerMap[name] = new RGBToYUVLayer(this, paramsDict);
} else if (type == "rgb2lab") {
_layerMap[name] = new RGBToLABLayer(this, paramsDict);
} else if (type == "rscale") {
_layerMap[name] = new RandomScaleLayer(this, paramsDict);
} else if (type == "concat") {
_layerMap[name] = new ConcatenationLayer(this, paramsDict);
} else if (type == "hs") {
_layerMap[name] = new HiddenSexLayer(this, paramsDict);
} else if (strncmp(type.c_str(), "cost.", 5) == 0) {
CostLayer *c = &CostLayer::makeCostLayer(this, type, paramsDict);
_layerMap[name] = c;
_costs.push_back(c);
} else {
throw string("Unknown layer type ") + type;
}
}
/*
* This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
*/
void ConvNetGPU::initCuda() {
NVMatrix::setDeviceID(_deviceID);
checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
int d = _convNet->getDeviceID(i);
if (d != _deviceID) {
if (NVMatrix::canAccessDevice(_deviceID, d)) {
printf("Enabling peer access %d --> %d\n", NVMatrix::getDeviceID(), d);
checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
} else {
printf("No peer access %d --> %d\n", _deviceID, d);
}
}
}
NVMatrix::initCublas();
NVMatrix::initRandom();
srand(time(0));
}
void* ConvNetGPU::run() {
initCuda();
while (true) {
Message* m = _msgQueue.dequeue();
if (m->getMessageType() == FPROP_READY) {
FpropMessage* msg = static_cast<FpropMessage*>(m);
_layerMap[msg->getToLayer()]->fprop(msg->getPassType());
} else if (m->getMessageType() == BPROP_READY) {
BpropMessage* msg = static_cast<BpropMessage*>(m);
_layerMap[msg->getToLayer()]->incRcvdBInputMsgs();
_layerMap[msg->getToLayer()]->bprop(msg->getPassType());
} else if (m->getMessageType() == BPROP_START) {
BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
for (int i = 0; i < _costs.size(); i++) {
dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType());
}
} else if (m->getMessageType() == SYNC) {
_convNet->getSync().sync();
} else if (m->getMessageType() == COPY_TO_CPU) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->copyToCPU();
}
} else if (m->getMessageType() == COPY_TO_GPU) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->copyToGPU();
}
} else if (m->getMessageType() == RESET) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->reset();
}
} else if (m->getMessageType() == UPDATE_WEIGHTS) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->updateWeights();
}
} else if (m->getMessageType() == RUNME) {
RunMeMessage* msg = static_cast<RunMeMessage*>(m);
msg->run();
}
delete m;
}
return NULL;
}
Cost& ConvNetGPU::getCost(int numCases) {
return *new Cost(numCases, _costs);
}
Layer& ConvNetGPU::operator[](string& name) {
return *_layerMap[name];
}
Layer& ConvNetGPU::getLayer(string& name) {
return *_layerMap[name];
}
int ConvNetGPU::getDeviceID() {
return _deviceID;
}
Queue<Message*>& ConvNetGPU::getMessageQueue() {
return _msgQueue;
}
void ConvNetGPU::enqueueMessage(Message* msg) {
getMessageQueue().enqueue(msg);
}
vector<CostLayer*>& ConvNetGPU::getCostLayers() {
return _costs;
}
map<string, Layer*>& ConvNetGPU::getLayerMap() {
return _layerMap;
}
ConvNet& ConvNetGPU::getConvNet() {
return *_convNet;
}
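/*
* Usage sketch (hypothetical and simplified): in this commit the training loop is actually driven
* by Worker objects dequeued in ConvNet::run, and PASS_TRAIN is assumed here to be one of the
* PASS_TYPE values alongside the PASS_GC used above. Roughly, a training worker does:
*
*   convNet->getDataProvider().setData(data);           // data: CPUData holding the batch
*   for (int i = 0; i < convNet->getDataProvider().getNumMinibatches(); ++i) {
*       convNet->fprop(i, PASS_TRAIN);                   // forward pass on minibatch i
*       convNet->bprop(PASS_TRAIN);                      // backward pass
*       if (convNet->getNumBwdMiniPasses() == 0) {       // a full weight-update period has elapsed
*           convNet->updateWeights();
*       }
*   }
*   Cost& cost = convNet->getCost();                     // per-thread costs merged for the last minibatch
*/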

126
src/cost.cu Normal file

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <iostream>
#include <cost.cuh>
using namespace std;
/*
* =====================
* Cost
* =====================
*/
Cost::Cost(int numCases) : _numCases(numCases) {
}
Cost::Cost(int numCases, vector<CostLayer*>& costs) : _numCases(numCases) {
for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
_costMap[(*it)->getName()] = &(*it)->getCost();
_costCoeffMap[(*it)->getName()] = (*it)->getCoeff();
}
}
int Cost::getNumCases() {
return _numCases;
}
doublev& Cost::operator [](const string s) {
return *_costMap[s];
}
CostMap& Cost::getCostMap() {
return _costMap;
}
CostCoeffMap& Cost::getCostCoeffMap() {
return _costCoeffMap;
}
double Cost::getValue() {
double val = 0;
for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
val += _costCoeffMap[it->first] * it->second->at(0);
}
return val;
}
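// Accumulates another Cost into this one: matching cost vectors are summed elementwise and the
// case counts are added, e.g. to aggregate results across minibatches.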
Cost& Cost::operator += (Cost& er) {
CostMap& otherMap = er.getCostMap();
CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
if (_costMap.count(it->first) == 0) {
_costMap[it->first] = new doublev();
_costCoeffMap[it->first] = otherCoeffMap[it->first];
}
vector<double>& myVec = *_costMap[it->first];
vector<double>& otherVec = *otherMap[it->first];
assert(myVec.size() == 0 || myVec.size() == otherVec.size());
for (int i = 0; i < otherVec.size(); i++) {
if (myVec.size() <= i) {
myVec.push_back(0);
}
myVec[i] += otherVec[i];
}
}
_numCases += er.getNumCases();
return *this;
}
// Merge costs in er into this cost. Unlike operator+=, this requires both costs to cover the
// same cases and to have disjoint cost maps; it is used to combine the partial costs returned
// by the different GPU threads (see ConvNet::getCost).
Cost& Cost::operator |= (Cost& er) {
assert(er.getNumCases() == getNumCases());
CostMap& otherMap = er.getCostMap();
CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
assert(_costMap.count(it->first) == 0);
_costMap[it->first] = new doublev();
_costCoeffMap[it->first] = otherCoeffMap[it->first];
vector<double>& myVec = *_costMap[it->first];
vector<double>& otherVec = *otherMap[it->first];
myVec.insert(myVec.begin(), otherVec.begin(), otherVec.end());
}
return *this;
}
Cost& Cost::operator /= (const double v) {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
for (doublev::iterator it2 = it->second->begin(); it2 != it->second->end(); ++it2) {
*it2 /= v;
}
}
return *this;
}
Cost::~Cost() {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
delete it->second;
}
}

65
src/cpuCNN.cu Normal file

@@ -0,0 +1,65 @@
#include "softmaxtree.cuh"
/*
* weights: (numNodes, numFeatures)
* targets: (numNodes, numFeatures)
*
* Forward pass over the tree, top-down by depth: each node's output row is its own weight row
* plus its parent's already-computed output row, i.e. the sum of the weight rows along the path
* from the root to that node.
*/
void cpuSoftmaxTreeFwd(float* weights, float* targets, const int numFeatures, SoftmaxTree& tree) {
for (int d = 0; d <= tree.getDepth(); ++d) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it!= tree.getNodesAtDepth(d).end(); ++it) {
SoftmaxNode& node = **it;
SoftmaxNode* parent = node.getParent();
for (int f = 0; f < numFeatures; ++f) {
targets[node.getLabel() * numFeatures + f] = weights[node.getLabel() * numFeatures + f]
+ (parent == NULL ? 0 : targets[parent->getLabel() * numFeatures + f]);
}
}
}
}
/*
* grads: (numNodes, numFeatures)
*
* Backward pass over the tree, bottom-up by height: each internal node's gradient row is
* overwritten with the sum of its children's gradient rows.
*/
void cpuSoftmaxTreeBwd(float* grads, const int numFeatures, SoftmaxTree& tree) {
for (int h = 1; h <= tree.getHeight(); ++h) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtHeight(h).begin(); it!= tree.getNodesAtHeight(h).end(); ++it) {
SoftmaxNode& node = **it;
for (int f = 0; f < numFeatures; ++f) {
grads[node.getLabel() * numFeatures + f] = 0;
}
for (SoftmaxNodeV::iterator itc = node.getChildren().begin(); itc!= node.getChildren().end(); ++itc) {
SoftmaxNode& child = **itc;
for (int f = 0; f < numFeatures; ++f) {
grads[node.getLabel() * numFeatures + f] += grads[child.getLabel() * numFeatures + f];
}
}
}
}
}
/*
* weights: (numNodes, numFeatures)
* weightsInc: (numNodes, numFeatures)
* weightsGrad: (numNodes, numFeatures)
* nodeSizes: numNodes-array whose ith element gives number of leaves under
* node with label i.
*/
void cpuSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
const int numFeatures, float eps, const float mom, float wc, SoftmaxTree& tree) {
for (int d = 0; d <= tree.getDepth(); d++) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it!= tree.getNodesAtDepth(d).end(); ++it) {
SoftmaxNode& node = **it;
float w = wc / node.getSize();
float e = eps;// * sqrt(node.getSize());
for (int f = 0; f < numFeatures; ++f) {
weightsInc[node.getLabel() * numFeatures + f] = mom * weightsInc[node.getLabel() * numFeatures + f]
+ e * (weightsGrad[node.getLabel() * numFeatures + f] - w * weights[node.getLabel() * numFeatures + f]);
weights[node.getLabel() * numFeatures + f] += weightsInc[node.getLabel() * numFeatures + f];
}
}
}
}

98
src/data.cu Normal file

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <data.cuh>
#include <vector>
using namespace std;
DataProvider::DataProvider(int minibatchSize) :
_minibatchSize(minibatchSize), _hData(NULL) {
}
void DataProvider::clearData() {
delete _hData;
_hData = NULL;
}
void DataProvider::setData(CPUData& hData) {
// This is now deleted by the DataWorker's destructor
// delete _hData; // Delete old CPU matrices
_hData = &hData;
}
CPUData& DataProvider::getMinibatch(int idx) {
assert(idx >= 0 && idx < getNumMinibatches());
return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize);
}
CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
endCase = min(_hData->getNumCases(), endCase);
// TODO: maintain these matrices, no point re-creating them all the time
MatrixV& miniData = *new MatrixV();
for (int i = 0; i < _hData->getData().size(); i++) {
// NOTE: if hData is transposed, then the output minibatch matrix
// can be a view. No need to allocate new CPU memory here. Might
// want to look into optimizing that in the future, though it's
// unlikely to be a big deal.
if (_hData->isTrans()) {
miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
} else {
miniData.push_back(new Matrix());
(*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
}
}
return *new CPUData(&miniData);
}
int DataProvider::getNumMinibatches() {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
return DIVUP(_hData->getNumCases(), _minibatchSize);
}
int DataProvider::getMinibatchSize() {
return _minibatchSize;
}
int DataProvider::getNumCases() {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
return _hData->getNumCases();
}
int DataProvider::getNumCasesInMinibatch(int idx) {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
assert(idx >= 0 && idx < getNumMinibatches());
return min(_minibatchSize, max(0, _hData->getNumCases() - idx * _minibatchSize));
}

34
src/hostmem.cu Normal file

@@ -0,0 +1,34 @@
#include <hostmem.cuh>
PinnedHostMem::PinnedHostMem() : _numBytes(0), _data(NULL) {
}
PinnedHostMem::~PinnedHostMem() {
if (_numBytes > 0) {
checkCudaErrors(cudaFreeHost(_data));
}
}
void PinnedHostMem::resize(uint bytes) {
if (_numBytes != bytes) {
if (_numBytes > 0) {
checkCudaErrors(cudaFreeHost(_data));
}
checkCudaErrors(cudaHostAlloc(&_data, bytes, cudaHostAllocPortable));
_numBytes = bytes;
}
}
void PinnedHostMem::copyFrom(void* src, uint bytes) {
resize(bytes);
checkCudaErrors(cudaMemcpy(_data, src, bytes, cudaMemcpyDefault));
}
void PinnedHostMem::copyTo(void* dst) {
checkCudaErrors(cudaMemcpy(dst, _data, _numBytes, cudaMemcpyDefault));
}
void* PinnedHostMem::getData() {
return _data;
}

2002
src/layer.cu Normal file

File diff suppressed because it is too large

720
src/layer_kernels.cu Normal file

@@ -0,0 +1,720 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <vector>
#include <layer_kernels.cuh>
//#define LOG(x) ((x) > 0.0 ? log(x) : -1000.0)
// Computes log(exp(x) + exp(y))
//#define LOGADD(x, y) ()
using namespace std;
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* energies: (numOut, numCases)
* labels: (1, numCases)
* maxEnergies: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases)
*
* This routine uses energies to determine top-1 score because they're more accurate than top-n
* probabilities, which have numerical errors in them.
*/
__global__ void kMultiSoftmaxCost_engs(float* probs, float* energies, float* labels, float* maxEnergies,
float* labelLogProbs, float* correctProbs, float* top5Probs,
const int numCases, const int numOut, const int setSize) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxe = maxEnergies[tx];
const float labelp = probs[label * numCases + tx];
const float labele = energies[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
int numBiggerEnergies = 0, numEqualsEnergies = 0;
for (int i = 0; i < numOut; ++i) {
numBiggerEnergies += energies[i * numCases + tx] > labele;
numEqualsEnergies += energies[i * numCases + tx] == labele;
}
const int slotsLeft = setSize - numBiggerEnergies;
top5Probs[tx] = slotsLeft <= 0 ? 0 : (numEqualsEnergies <= slotsLeft ? 1 : float(slotsLeft) / numEqualsEnergies);
// if (numEqualsEnergies != 1) {
// printf("numEqualsEnergies: %d, labelp: %e, maxp: %e\n", numEqualsEnergies, labelp, maxe);
// }
correctProbs[tx] = labele != maxe ? 0.0f : 1.0f / float(numEqualsEnergies);
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxEnergies: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases)
*
*/
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
float* labelLogProbs, float* correctProbs, float* top5Probs,
const int numCases, const int numOut, const int setSize) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
int numBiggerProbs = 0, numEqualsProbs = 0;
for (int i = 0; i < numOut; ++i) {
numBiggerProbs += probs[i * numCases + tx] > labelp;
numEqualsProbs += probs[i * numCases + tx] == labelp;
}
const int slotsLeft = setSize - numBiggerProbs;
top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases) == log(y_l[labels,:])
*/
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& energies, NVMatrix& labelLogProbs_out,
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize, bool useEnergies) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
assert(energies.isContiguous());
assert(energies.isSameDims(probs));
// NVMatrix& maxProbs = probs.max(0);
NVMatrix& maxPE = useEnergies ? energies.max(0) : probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
top5Probs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
if (useEnergies) {
cudaFuncSetCacheConfig(kMultiSoftmaxCost_engs, cudaFuncCachePreferL1);
kMultiSoftmaxCost_engs<<<blocks, threads>>>(probs.getDevData(), energies.getDevData(), labels.getDevData(), maxPE.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
numCases, numOut, setSize);
} else {
cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
kMultiSoftmaxCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxPE.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
numCases, numOut, setSize);
}
getLastCudaError("computeLogregCost: Kernel execution failed");
// cudaThreadSynchronize();
delete &maxPE;
}
/*
* energies: (numCases, numOut) (yes this is weird)
* bLattice: (numOut + 1, setSize, numCases) (*out)
*
* This is intended to work for cases when setSize <= 32.
* Block size (y, x) = (1, B_X)
*
* NOTE:
* B_X must be a multiple of 32
*/
template <int B_X>
__global__ void kMSMBackward(float* energies, float* bLattice, const int numCases, const int numOut, const int setSize) {
extern __shared__ float shmem[];
const int tidx = blockIdx.x * B_X + threadIdx.x;
const int casesPerWarp = 32 / setSize;
const int casesPerBlock = casesPerWarp * B_X / 32;
const int numWorkersPerWarp = casesPerWarp * setSize;
const int tidxInWarp = tidx % 32;
const int warpIdx = tidx / 32;
const int blockCaseIdx = blockIdx.x * casesPerBlock;
const int caseIdxInBlock = threadIdx.x / setSize;
const int caseIdx = warpIdx * casesPerWarp + tidxInWarp / setSize;
const bool doWork = tidxInWarp < numWorkersPerWarp && caseIdx < numCases;
const int bIdx = threadIdx.x + threadIdx.x/setSize + 1;
volatile float* B = shmem;
volatile float* shE = &shmem[B_X + casesPerBlock]; // Dimensions (casesPerBlock, 32 + 1)
const int loadY = warpIdx;
const int loadX = tidxInWarp;
energies += (blockCaseIdx + loadY) * numOut + loadX;
bLattice += tidx;
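// NOTE: the early return below and the printf calls further down look like debugging leftovers;
// as written, only block 0 does any work.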
if (blockIdx.x != 0) {
return;
}
// The first row of the lattice has a 1 in the columns corresponding to
// zero set size, 0 elsewhere.
for (int t = threadIdx.x; t < B_X + casesPerBlock; t += B_X) {
B[t] = t % setSize == 0;
}
for (int l = 0; l < numOut / 32; ++l) { // Load 32 energies at a time for casesPerBlock cases
__syncthreads();
// Load energies into shmem
for (int r = 0; r < casesPerBlock && blockCaseIdx + loadY + r < numCases; r += B_X / 32) {
shE[(r + loadY) * (32 + 1) + loadX] = __expf(energies[r * numOut]);
printf("%f\n", energies[r * numOut]);
}
__syncthreads();
// Compute 32 rows of the lattice
if (doWork) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
B[bIdx] = B[bIdx - 1] * shE[caseIdxInBlock * (32 + 1) + i] + B[bIdx];
bLattice[i * numCases * setSize] = B[bIdx];
// printf("thread %d wrote %d to idx %d\n", tidx, B[bIdx], bIdx);
}
}
printf("thread %d made it\n", tidx);
bLattice += 32 * numCases * setSize;
}
// if (numOut % 32 != 0) {
// __syncthreads();
//
// }
}
/*
* energies: (numCases, numOut) (yes this is weird)
* bLattice: (numOut + 1, setSize, numCases) (*out)
*/
void MSMBackward(NVMatrix& energies, NVMatrix& bLattice, int setSize) {
int numCases = energies.getNumRows();
int numOut = energies.getNumCols();
assert(!energies.isTrans());
assert(!bLattice.isTrans());
assert(energies.isContiguous());
assert(bLattice.isContiguous());
bLattice.resize((numOut + 1) * setSize, numCases);
int B_X = 32;
int casesPerBlock = B_X / setSize;
int shmem = 4*(B_X + casesPerBlock + casesPerBlock * (32 + 1));
dim3 threads(B_X, 1);
dim3 blocks(DIVUP(numCases*setSize, B_X), 1);
printf("allocating %d words of shmem\n", shmem/4);
cudaFuncSetCacheConfig(kMSMBackward<32>, cudaFuncCachePreferShared);
kMSMBackward<32><<<blocks, threads, shmem>>>(energies.getDevData(), bLattice.getDevData(),
numCases, numOut, setSize);
getLastCudaError("kMSMBackward: Kernel execution failed");
}
/*
* E = sum(p_l * log(y_l))
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
probs += tx;
labels += tx;
maxProbs += tx;
labelLogProbs += tx;
correctProbs += tx;
const float maxp = maxProbs[0];
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
float crossEnt = 0.0f;
int numMax = 0;
bool correctLabel = false;
for (int i = 0; i < numOut; i++) {
const float label_prob = labels[i * numCases];
const float model_prob = probs[i * numCases];
numMax += model_prob == maxp;
crossEnt += label_prob * safelog(model_prob);
correctLabel |= model_prob == maxp && label_prob > 0.0f;
}
labelLogProbs[0] = crossEnt;
if (!correctLabel) {
correctProbs[0] = 0.0f;
} else {
correctProbs[0] = 1.0f / float(numMax);
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const float label_prob = labels[tidx];
const float model_prob = y_l[tidx];
const float v = gradCoeff * __fdividef(label_prob, model_prob);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
float v = 0;
const float model_prob = y_l[tidx];
for (int j = 0; j < numOut; j++) {
const float label_prob = labels[j * numCases + tx];
v += label_prob * ((j == ty) - model_prob);
}
v *= gradCoeff;
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
if (labelp != maxp) {
correctProbs[tx] = 0;
} else {
int numMax = 0;
for (int i = 0; i < numOut; i++) {
numMax += probs[i * numCases + tx] == maxp;
}
correctProbs[tx] = 1.0f / float(numMax);
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * (label == ty);
v = __fdividef(v, y_l[tidx]);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * ((label == ty) - y_l[tidx]);
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* dE_dy_l: (numOut, numCases)
* y_l: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*
* Standard softmax backprop: dE_dx_i = y_i * sum_j dE_dy_j * ((j == i) - y_j).
*/
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
float v = 0;
for (int j = 0; j < numOut; j++) {
v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
}
v *= y_l[tidx];
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
const int numElements) {
for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
if (add) {
target[i] += actGrad[i] * (output[i] == input[i]);
} else {
target[i] = actGrad[i] * (output[i] == input[i]);
}
}
}
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
assert(actGrad.isContiguous());
assert(output.isContiguous());
assert(input.isContiguous());
assert(actGrad.isSameDims(input));
assert(actGrad.isSameDims(output));
dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
dim3 threads(128);
if (add) {
assert(actGrad.isSameDims(target));
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, true><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
} else {
target.resize(actGrad);
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, false><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
}
getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
/*
* E = sum_i{-p_i*log(y_i)}
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.isSameDims(probs));
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
kCrossEntCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("kCrossEntCost: Kernel execution failed");
delete &maxProbs;
}
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.isSameDims(probs));
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kCrossEntGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kCrossEntGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, bool add) {
int numCases = acts.getLeadingDim();
int numOut = acts.getFollowingDim();
assert(acts.isSameDims(actsGrad));
assert(acts.isContiguous());
assert(actsGrad.isContiguous());
assert(target.isContiguous());
assert(acts.isTrans());
assert(actsGrad.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(acts);
kSoftmaxGrad<false><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
} else {
kSoftmaxGrad<true><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
}
getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
assert(!labels.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases) == log(y_l[labels,:])
*/
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
kLogregCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("computeLogregCost: Kernel execution failed");
// cudaThreadSynchronize();
delete &maxProbs;
}
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kLogregCostGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregCostGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregGrad: Kernel execution failed");
}
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kLogregSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}

186
src/lr.cu Normal file

@@ -0,0 +1,186 @@
#include <string>
#include <lr.cuh>
#include <util.cuh>
using namespace std;
/*
* ==================================
* LearningRateSchedule
* ==================================
*/
LearningRateSchedule& LearningRateSchedule::make(PyObject* lrsDict, double baseRate) {
string type = pyDictGetString(lrsDict, "type");
if (type == "default") {
return *new LearningRateSchedule(baseRate, 0);
} else {
PyObject* paramsDict = PyDict_GetItemString(lrsDict, "params");
double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
double noiseStdev = pyDictGetFloat(paramsDict, "noiseStdev");
if (type == "linear") {
return *new LinearLRS(baseRate, tgtFactor, noiseStdev);
} else if (type == "exp") {
return *new ExpLRS(baseRate, tgtFactor, noiseStdev);
} else if (type == "dexp") {
double numSteps = pyDictGetInt(paramsDict, "numSteps");
return *new DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
} else if (type == "jdexp") {
double numSteps = pyDictGetInt(paramsDict, "numSteps");
return *new JumpyDiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
}
}
throw string("Unknown learning rate schedule type ") + type;
}
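/*
* The lrsDict passed to make() comes from the Python-side configuration. A sketch of its shape,
* using the field names read above (the values here are purely illustrative):
*
*   {'type': 'dexp', 'params': {'tgtFactor': 100.0, 'noiseStdev': 0.0, 'numSteps': 10}}
*/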
LearningRateSchedule::LearningRateSchedule(double baseRate, double noiseStdev)
: _baseRate(baseRate), _noiseStdev(noiseStdev), _haveRandnSpare(false), _randnSpare(0) {
}
LearningRateSchedule::LearningRateSchedule(double baseRate)
: _baseRate(baseRate), _noiseStdev(0), _haveRandnSpare(false), _randnSpare(0) {
}
double LearningRateSchedule::getRate(double progress) {
return _noiseStdev > 0 ? _getRate(progress) * (1 + abs(randn()) * _noiseStdev)
: _getRate(progress);
}
double LearningRateSchedule::_getRate(double progress) {
return _baseRate;
}
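// Draws standard-normal samples via the Box-Muller transform (note the hard-coded 3.1415
// approximation of pi), generating two per pass and caching the second in _randnSpare.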
inline double LearningRateSchedule::randn() {
if (!_haveRandnSpare) {
double T = 2 * 3.1415 * rand();
double R = std::sqrt(-2 * std::log(rand()));
_randnSpare = R * std::sin(T);
_haveRandnSpare = true;
return R * std::cos(T);
}
_haveRandnSpare = false;
return _randnSpare;
}
// This should never generate zero
inline double LearningRateSchedule::rand() const {
return double(1L + random()) / (1L + RAND_MAX);
}
inline double LearningRateSchedule::abs(double x) const {
return x > 0 ? x : -x;
}
double LearningRateSchedule::getBaseRate() const {
return _baseRate;
}
LearningRateSchedule::~LearningRateSchedule() {
}
/*
* ==================================
* LinearLRS
* ==================================
*/
LinearLRS::LinearLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev) {
_finalRate = baseRate / tgtFactor;
}
double LinearLRS::_getRate(double progress) {
return _baseRate * (1 - progress) + _finalRate * progress;
}
/*
* ==================================
* ExpLRS
* ==================================
*/
ExpLRS::ExpLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev) {
double finalRate = baseRate / tgtFactor;
_pow = baseRate == 0 ? 1 : (std::log(finalRate) / std::log(baseRate) - 1);
}
double ExpLRS::_getRate(double progress) {
return std::pow(_baseRate, 1.0 + progress * _pow);
}
/*
* ==================================
* TanhLRS
* ==================================
*/
TanhLRS::TanhLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev), _alpha(0), _beta(0) {
if (baseRate > 0) {
double finalRate = baseRate / tgtFactor;
_beta = 0.5 * (baseRate + finalRate);
_alpha = 2 * atanh((baseRate - finalRate) / (baseRate + finalRate));
}
}
double TanhLRS::_getRate(double progress) {
return _beta * (tanh(-_alpha * (progress - 0.5)) + 1.0);
}
/*
* ==================================
* DiscreteExpLRS
* ==================================
*/
DiscreteExpLRS::DiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
: LearningRateSchedule(baseRate, noiseStdev) {
ExpLRS elrs(baseRate, tgtFactor, 0);
double finalRate = baseRate / tgtFactor;
for (int i = 0; i < numSteps - 1; i++) {
double progress = double(i) / (numSteps - 1);
_rates.push_back(elrs._getRate(progress));
}
_rates.push_back(finalRate);
//printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps);
}
double DiscreteExpLRS::_getRate(double progress) {
for (int i = 0; i < _rates.size(); ++i) {
if (progress <= double(i + 1) / _rates.size()) {
return _rates[i];
}
}
return _rates.back();
}
/*
* ==================================
* JumpyDiscreteExpLRS
* ==================================
*/
JumpyDiscreteExpLRS::JumpyDiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
: DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps) {
}
double JumpyDiscreteExpLRS::_getRate(double progress) {
int rateIdx = 0;
for (int i = 0; i < _rates.size(); ++i) {
if (progress <= double(i + 1) / _rates.size()) {
rateIdx = i;
break;
}
}
// The midpoint of the interval that progress falls into.
double intervalMid = double(rateIdx + 0.5) / _rates.size();
// Jumpy learning rate works like this:
// If progress is before the midpoint of the current interval (or outside the
// (0.2, 0.9) progress window), it returns the same learning rate as
// DiscreteExpLRS would.
// Otherwise it "jumps" back toward the previous interval's rate (provided there is one).
// Variant 1 (disabled): jump all the way back to the previous interval's rate.
// rateIdx -= rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;
// Variant 2 (active): use the geometric average of the learning rates of the
// current and previous intervals.
bool jump = rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;
return jump ? sqrt(_rates[rateIdx] * _rates[rateIdx - 1]) : _rates[rateIdx];
}
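// Illustration (assuming numSteps = 5, i.e. five intervals of width 0.2):
// progress = 0.25 falls in interval 1, whose midpoint is 0.3, so no jump: _rates[1].
// progress = 0.35 is past that midpoint and inside (0.2, 0.9), so the result is
// sqrt(_rates[1] * _rates[0]), the geometric mean with the previous interval's rate.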

126
src/multisoftmax.cpp Normal file
View file

@ -0,0 +1,126 @@
#include <assert.h>
//#include <mathimf.h>
#include <multisoftmax.h>
using namespace std;
// Computes log(exp(x) + exp(y))
inline double logadd(const double x, const double y) {
if (x <= -INF && y <= -INF) {
return -INF;
}
const double M = max(x,y);
const double m = min(x,y);
const double diff = M - m;
// return diff > 15 ? M : M + LOG(1.0 + EXP(-diff));
// return m <= -INF ? M : M + LOG(1.0f + EXP(-diff));
return diff > 15 ? M : (diff > 5 ? M + EXP(-diff) : M + LOG(1.0 + EXP(-diff)));
}
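// Numerical note: evaluating log(exp(x) + exp(y)) directly underflows for very
// negative inputs (x = y = -1000 would give log(0 + 0) = -inf); factoring out the
// max gives M + log(1 + exp(-diff)) = -1000 + log(2) instead. The diff > 15 branch
// drops the exp term (it is below ~3e-7), and the diff > 5 branch uses the
// approximation log(1 + z) ~= z, accurate to about z^2 / 2 there.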
/*
* elts: (numCases, numOut)
* logB: (N + 1, size + 1) -- backward lattice matrix, MUST BE initially -INF
* fixed: (numCases, 1)
* probs: (numCases, numOut) (*out)
*
* double precision is much faster than single. :/
*/
void MultiSoftmaxCPU_T_logspace(Matrix& elts, Matrix& logB, Matrix& probs, Matrix& fixed, int size, bool nofix) {
int numCases = elts.getNumRows();
assert(probs.isSameDims(elts));
assert(!elts.isTrans());
assert(!logB.isTrans());
assert(!probs.isTrans());
assert(fixed.getNumRows() == numCases);
assert(fixed.getNumCols() == 1);
int N = elts.getNumCols();
Matrix& logF = *new Matrix(size + 1, 1); // Forward column
// Prepare logB
logB(N, 0) = 0;
for (int c = 0; c < numCases; ++c) {
int fx = nofix ? -1 : int(fixed(c, 0));
// Backward pass
for (int i = N - 1; i >= 0; --i) {
double elt = elts(c, i);
logB(i, 0) = i <= fx ? -INF : 0.0f;
for (int s = max(1, size - i); s < size + 1; ++s) {
logB(i, s) = fx == i ? logB(i + 1, s - 1) + elt : logadd(logB(i + 1, s - 1) + elt, logB(i + 1, s));
}
}
// Log partition function
double logZ = logB(0, size);
// Forward pass
logF.apply(Matrix::ONE);
logF.scale(-INF);
logF(0, 0) = 0;
for (int i = 1; i < N + 1; ++i) {
double logy = -INF;
double elt = elts(c, i - 1);
for (int s = size; s >= 0; --s) {
if (s < size) {
logy = logadd(logy, logF(s, 0) + logB(i, size - 1 - s));
}
if (s > 0) {
logF(s, 0) = fx == i - 1 ? logF(s - 1, 0) + elt : logadd(logF(s - 1, 0) + elt, logF(s, 0));
} else if (fx == i - 1) {
logF(0, 0) = -INF;
}
}
logy += elt - logZ;
probs(c, i - 1) = EXP(logy) - (fx >= 0 ? probs(c, i - 1) : 0);
}
}
delete &logF;
}
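// Loosely: the backward pass fills logB(i, s) with the log-sum over all ways of
// picking s of the remaining items i..N-1, the forward column logF does the same
// for the prefix, and combining the two gives each item's (log) marginal probability
// of belonging to a subset of exactly `size` items. Both passes are O(N * size) per
// case, so the routine as a whole is O(numCases * N * size).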
MultiSoftmaxWorker::MultiSoftmaxWorker(Matrix* elts, Matrix* B, Matrix* probs, Matrix* fixed, int size, bool nofix)
: Thread(true), _elts(elts), _B(B), _probs(probs), _fixed(fixed), _size(size), _nofix(nofix) {
}
MultiSoftmaxWorker::~MultiSoftmaxWorker() {
delete _elts;
delete _probs;
delete _fixed;
}
void* MultiSoftmaxWorker::run() {
MultiSoftmaxCPU_T_logspace(*_elts, *_B, *_probs, *_fixed, _size, _nofix);
return NULL;
}
/*
* elts: (numCases, numOut)
* B: vector of (N + 1, size + 1) -- backward lattice matrices, each MUST BE initially -INF
* (see MultiSoftmaxCPU_T_logspace)
* fixed: (numCases, 1)
* probs: (numCases, numOut) (*out)
*
* NOTE: remember to write a version of this for transposed matrices.
* It may end up being significantly faster, which is important if
* I plan to use CPU for this.
*/
void MultiSoftmaxCPU_T_parallel(Matrix& elts, vector<Matrix*>& B, Matrix& probs, Matrix& fixed, int size, bool nofix) {
int numCases = elts.getNumRows();
int numWorkers = min(numCases, (int)B.size());
probs.resize(elts);
int casesPerWorker = DIVUP(numCases, B.size());
numWorkers = min(numWorkers, DIVUP(numCases, casesPerWorker));
vector<Thread*> workers;
for (int i = 0; i < numWorkers; ++i) {
Matrix* eltSlice = &elts.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
Matrix* probSlice = &probs.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
Matrix* fixedSlice = &fixed.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
workers.push_back(new MultiSoftmaxWorker(eltSlice, B[i], probSlice, fixedSlice, size, nofix));
workers[i]->start();
}
for (int i = 0; i < numWorkers; ++i) {
workers[i]->join();
delete workers[i];
}
}

85
src/neuron.cu Normal file
View file

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <neuron.cuh>
#include <util.cuh>
using namespace std;
Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
string type = pyDictGetString(neuronDict, "type");
PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");
if (type == "relu") {
return *new ReluNeuron();
}
if (type == "nrelu") {
return *new NoisyReluNeuron();
}
if (type == "drelu") {
return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "softrelu") {
return *new SoftReluNeuron();
}
if (type == "brelu") {
return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "abs") {
return *new AbsNeuron();
}
if (type == "logistic") {
return *new LogisticNeuron();
}
if (type == "tanh") {
return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "square") {
return *new SquareNeuron();
}
if (type == "sqrt") {
return *new SqrtNeuron();
}
if (type == "linear") {
return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "ident") {
return *new Neuron();
}
throw string("Unknown neuron type: ") + type;
}

242
src/pyconvnet.cu Normal file
View file

@ -0,0 +1,242 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <Python.h>
#include <arrayobject.h>
#include <assert.h>
#include <helper_cuda.h>
#include <cublas.h>
#include <time.h>
#include <vector>
#include <matrix.h>
#include <queue.h>
#include <worker.cuh>
#include <util.cuh>
#include <cost.cuh>
#include <pyconvnet.cuh>
#include <convnet.cuh>
using namespace std;
static ConvNet* model = NULL;
static PyMethodDef _ConvNetMethods[] = {{ "initModel", initModel, METH_VARARGS },
{ "startBatch", startBatch, METH_VARARGS },
{ "finishBatch", finishBatch, METH_VARARGS },
{ "checkGradients", checkGradients, METH_VARARGS },
{ "startMultiviewTest", startMultiviewTest, METH_VARARGS },
{ "startFeatureWriter", startFeatureWriter, METH_VARARGS },
{ "startDataGrad", startDataGrad, METH_VARARGS },
{ "syncWithHost", syncWithHost, METH_VARARGS },
{ NULL, NULL }
};
#if defined(_WIN64) || defined(_WIN32)
extern "C" __declspec(dllexport) void initpyconvnet() {
(void) Py_InitModule("pyconvnet", _ConvNetMethods);
import_array();
}
#else
void INITNAME() {
(void) Py_InitModule(QUOTEME(MODELNAME), _ConvNetMethods);
import_array();
}
#endif
PyObject* initModel(PyObject *self, PyObject *args) {
assert(model == NULL);
PyDictObject* pyLayerParams;
PyListObject* pyDeviceIDs, *pyDeviceCPUs;
int pyMinibatchSize;
int pyWeightUpdateFreq;
if (!PyArg_ParseTuple(args, "O!O!O!ii",
&PyDict_Type, &pyLayerParams,
&PyList_Type, &pyDeviceIDs,
&PyList_Type, &pyDeviceCPUs,
&pyMinibatchSize,
&pyWeightUpdateFreq)) {
return NULL;
}
intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs);
vector<intv*>& deviceCPUs = *new vector<intv*>();
for (int i = 0; i < PyList_GET_SIZE(pyDeviceCPUs); i++) {
intv* v = getIntV(PyList_GetItem((PyObject*)pyDeviceCPUs, i));
deviceCPUs.push_back(v);
}
model = new ConvNet((PyObject*)pyLayerParams,
deviceIDs,
deviceCPUs,
pyMinibatchSize,
pyWeightUpdateFreq);
model->start();
return Py_BuildValue("i", 0);
}
/*
* Starts training/testing on the given batch (asynchronous -- returns immediately).
*/
PyObject* startBatch(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
double progress;
int test = 0;
if (!PyArg_ParseTuple(args, "O!d|i",
&PyList_Type, &data,
&progress,
&test)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
/*
* Starts testing on the given batch (asynchronous -- returns immediately).
*/
PyObject* startMultiviewTest(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
int numViews;
PyArrayObject* pyProbs = NULL;
char* logregName = NULL;
if (!PyArg_ParseTuple(args, "O!i|O!s",
&PyList_Type, &data,
&numViews,
&PyArray_Type, &pyProbs,
&logregName)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
MultiviewTestWorker* wr = pyProbs == NULL ? new MultiviewTestWorker(*model, *cpuData, numViews)
: new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
PyObject* startFeatureWriter(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
PyListObject* pyFtrs;
PyListObject* pyLayerNames;
if (!PyArg_ParseTuple(args, "O!O!O!",
&PyList_Type, &data,
&PyList_Type, &pyFtrs,
&PyList_Type, &pyLayerNames)) {
return NULL;
}
stringv* layerNames = getStringV((PyObject*)pyLayerNames);
CPUData* cpuData = new CPUData((PyObject*)data);
MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs);
FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
PyObject* startDataGrad(PyObject *self, PyObject *args) {
// assert(model != NULL);
// PyListObject* data;
// int dataLayerIdx, softmaxLayerIdx;
// if (!PyArg_ParseTuple(args, "O!ii",
// &PyList_Type, &data,
// &dataLayerIdx, &softmaxLayerIdx)) {
// return NULL;
// }
// CPUData* cpuData = new CPUData((PyObject*)data);
// Matrix& ftrs = *mvec.back();
// mvec.pop_back();
//
// DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx);
// model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
/*
* Waits for the trainer to finish training on the batch given to startBatch.
*/
PyObject* finishBatch(PyObject *self, PyObject *args) {
assert(model != NULL);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::BATCH_DONE);
Cost& cost = res->getResults();
PyObject* dict = PyDict_New();
CostMap& costMap = cost.getCostMap();
for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) {
PyObject* v = PyList_New(0);
for (vector<double>::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) {
PyObject* f = PyFloat_FromDouble(*iv);
PyList_Append(v, f);
}
PyDict_SetItemString(dict, it->first.c_str(), v);
}
PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases());
delete res; // Deletes cost too
return retVal;
}
PyObject* checkGradients(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
if (!PyArg_ParseTuple(args, "O!",
&PyList_Type, &data)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData);
model->getWorkerQueue().enqueue(wr);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::BATCH_DONE);
delete res;
return Py_BuildValue("i", 0);
}
/*
* Copies weight matrices from GPU to system memory.
*/
PyObject* syncWithHost(PyObject *self, PyObject *args) {
assert(model != NULL);
SyncWorker* wr = new SyncWorker(*model);
model->getWorkerQueue().enqueue(wr);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::SYNC_DONE);
delete res;
return Py_BuildValue("i", 0);
}

65
src/quantizer.cu Normal file
View file

@ -0,0 +1,65 @@
#include <quantizer.cuh>
using namespace std;
/*=================
* Quantizer
* ================
*/
Quantizer& Quantizer::make(PyObject* lrsDict) {
string type = pyDictGetString(lrsDict, "type");
if (type == "default") {
return *new Quantizer();
} else if (type == "half") {
return *new HalfQuantizer();
}
throw string("Unknown quantizer type ") + type;
}
Quantizer::Quantizer() : _numRows(0), _numCols(0), _trans(false) {
}
Quantizer::~Quantizer() {
}
void Quantizer::quantize(NVMatrix& src, NVMatrix& tgt) {
_quantize(src, tgt);
_quantized = &tgt;
_numRows = src.getNumRows();
_numCols = src.getNumCols();
_trans = src.isTrans();
}
void Quantizer::dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
_dequantize(tgt, scaleTarget, scaleOutput);
tgt.setTrans(_trans);
tgt.reshape(_numRows, _numCols);
}
void Quantizer::dequantize(NVMatrix& tgt) {
dequantize(tgt, 0, 1);
}
void Quantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
src.copy(tgt);
}
void Quantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
tgt.add(*_quantized, scaleTarget, scaleOutput);
}
/*=================
* HalfQuantizer
* ================
*/
HalfQuantizer::HalfQuantizer() : Quantizer() {
}
void HalfQuantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
convQuantizeHalf(src, tgt);
}
void HalfQuantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
convDequantizeHalf(*_quantized, tgt, _numRows * _numCols, scaleTarget, scaleOutput);
}
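// Usage sketch -- the dict contents and variable names here are illustrative only:
//   PyObject* quantDict = ...;                 // e.g. {"type": "half"}
//   Quantizer& q = Quantizer::make(quantDict);
//   NVMatrix stored, restored;
//   q.quantize(acts, stored);                  // keep a (possibly half-precision) copy of acts
//   q.dequantize(restored);                    // restored := stored (scaleTarget 0, scaleOutput 1)
//   q.dequantize(restored, 1.0f, 0.5f);        // or accumulate: restored := restored + 0.5 * stored
//   delete &q;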

441
src/softmaxtree.cu Normal file
View file

@ -0,0 +1,441 @@
#include <softmaxtree.cuh>
#include "layer.cuh"
using namespace std;
/*
* This launches a series of blocks for every node at a given depth.
* The "series" just spans the length of the weight vectors.
*
* The operation performed is (loosely):
* targets[d] := weights[d] + targets[d-1]
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (numNodesAtDepth, ceil(numFeatures/B_X))
*
* weights: (numNodes, numFeatures)
* nodes: numNodesAtDepth-length array of ushort2
* where x coordinate gives node idx and y coordinate gives parent idx
* targets: (numNodes, numFeatures)
*
*/
template<int B_X,bool root>
__global__ void kSoftmaxTreeFwd(float* weights, ushort2* nodes, float* targets, const int numFeatures) {
__shared__ ushort2 node; // node.x == node idx, node.y == parent node idx
const int depthNodeIdx = blockIdx.y;
const int featureOffset = blockIdx.x * B_X + threadIdx.x;
if (threadIdx.x == 0) {
node = nodes[depthNodeIdx];
}
__syncthreads();
weights += featureOffset;
targets += featureOffset;
// No loops for now
if (featureOffset < numFeatures) {
if (root) {
targets[node.x * numFeatures] = weights[numFeatures * node.x];
} else {
targets[node.x * numFeatures] = targets[node.y * numFeatures] + weights[numFeatures * node.x];
}
}
}
/*
* This launches a series of blocks for every node at a given height.
* The "series" just spans the length of the weight vectors.
*
* The operation performed is (loosely):
* grads[h] := sum_d{grads[h-1]}
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (numNodesAtHeight, ceil(numFeatures/B_X))
*
* grads: (numNodes, numFeatures)
* nodes: numNodesAtHeight-length array of ushort2
* where x coordinate gives node idx and y coordinate gives NUMBER OF CHILDREN
* ^ (note difference with kSoftmaxTreeFwd)
* childrenPtrs: numNodesAtHeight-length array of pointers to children indices
*
* The idea is to start one of these grids at each height, in sequence, starting
* from height = 1.
*
* The rows 0-numLabels-1 of grads must already have the correct softmax gradients (these
* are the nodes at height = 0).
*
*/
template<int B_X>
__global__ void kSoftmaxTreeBwd(float* grads, ushort2* nodes, ushort** childrenPtrs, const int numFeatures) {
__shared__ ushort2 node; // node.x == node idx, node.y == number of children
__shared__ ushort* childrenPtr;
__shared__ ushort children[B_X];
const int heightNodeIdx = blockIdx.y;
const int featureOffset = blockIdx.x * B_X + threadIdx.x;
if (threadIdx.x == 0) {
node = nodes[heightNodeIdx];
childrenPtr = childrenPtrs[heightNodeIdx];
}
__syncthreads();
grads += featureOffset;
const int nodeIdx = node.x;
const int numChildren = node.y;
float nodeGrad = 0;
for (int c = 0; c < numChildren; c += B_X) {
if (c + threadIdx.x < numChildren) {
children[threadIdx.x] = childrenPtr[c + threadIdx.x];
}
__syncthreads();
if (featureOffset < numFeatures) {
const int numChildrenLeft = min(B_X, numChildren - c);
for (int cc = 0; cc < numChildrenLeft; ++cc) {
const int childIdx = children[cc];
//const int childIdx = childrenPtr[c + cc];
nodeGrad += grads[childIdx * numFeatures];
}
}
__syncthreads();
}
if (featureOffset < numFeatures) {
grads[nodeIdx * numFeatures] = nodeGrad;
}
}
/*
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (1, numNodes)
*
* weights: (numNodes, numFeatures)
* weightsInc: (numNodes, numFeatures)
* weightsGrad: (numNodes, numFeatures)
* nodeSizes: numNodes-array whose ith element gives number of leaves under
* node with label i.
*
* TODO: why did I make nodeSizes ushort? int would prolly be fine.
*/
template<int B_X>
__global__ void kSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
ushort* nodeSizes, const int numFeatures,
float eps, const float mom, float wc) {
__shared__ int nodeSize; // number of leaves under this node
const int nodeIdx = blockIdx.x;
if (threadIdx.x == 0) {
nodeSize = nodeSizes[nodeIdx];
}
__syncthreads();
weights += nodeIdx * numFeatures;
weightsInc += nodeIdx * numFeatures;
weightsGrad += nodeIdx * numFeatures;
// TODO: make these shared?
// eps *= sqrtf(nodeSize);
wc /= nodeSize;
eps /= nodeSize; // larger epsw at the leaves
for (int f = threadIdx.x; f < numFeatures; f += B_X) {
const float inc = mom * weightsInc[f] + eps * (weightsGrad[f] - wc * weights[f]);
weightsInc[f] = inc;
weights[f] += inc;
}
}
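/*
* In effect each node gets the usual momentum update, but with the learning rate
* and weight decay divided by the node's leaf count:
*   inc := mom * inc + (eps / nodeSize) * (grad - (wc / nodeSize) * weight)
*   weight := weight + inc
* Interior nodes cover many leaves and receive summed gradients (see kSoftmaxTreeBwd),
* so they take proportionally smaller steps than the leaves.
*/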
/*
* ==================
* SoftmaxNode
* ==================
*/
int SoftmaxNode::setDistances(std::map<int, SoftmaxNodeV*>& nodeHeights,
std::map<int, SoftmaxNodeV*>& nodeDepths) {
_height = 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
_height = max(_height, (*it)->setDistances(nodeHeights, nodeDepths));
}
_height += _children.size() > 0;
if (nodeHeights.count(_height) == 0) {
nodeHeights[_height] = new SoftmaxNodeV();
}
if (nodeDepths.count(_depth) == 0) {
nodeDepths[_depth] = new SoftmaxNodeV();
}
nodeHeights[_height]->push_back(this);
nodeDepths[_depth]->push_back(this);
return _height;
}
void SoftmaxNode::setNodeCounts(int &nodes, int& leaves) {
nodes++;
leaves += _children.size() == 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
(*it)->setNodeCounts(nodes, leaves);
}
}
int SoftmaxNode::setSizes(ushort* nodeSizes) {
_size = _children.size() == 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
_size += (*it)->setSizes(nodeSizes);
}
nodeSizes[_label] = _size;
return _size;
}
SoftmaxNode::SoftmaxNode(SoftmaxNode* parent, int label)
: _parent(parent), _label(label), _size(0), _height(0) {
_depth = parent == NULL ? 0 : parent->getDepth() + 1;
}
SoftmaxNode::~SoftmaxNode() {
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
delete *it;
}
}
int SoftmaxNode::getDepth() const {
return _depth;
}
int SoftmaxNode::getHeight() const {
return _height;
}
int SoftmaxNode::getSize() const {
return _size;
}
int SoftmaxNode::getLabel() const {
return _label;
}
SoftmaxNode* SoftmaxNode::getParent() {
return _parent;
}
SoftmaxNodeV& SoftmaxNode::getChildren() {
return _children;
}
SoftmaxNode& SoftmaxNode::addChild(int label) {
_children.push_back(new SoftmaxNode(this, label));
return *_children.back();
}
/*
* ==================
* SoftmaxTree
* ==================
*/
SoftmaxTree::SoftmaxTree(int rootLabel) {
_root = new SoftmaxNode(NULL, rootLabel);
_nodeSizes = NULL;
_numNodes = 0;
_numLeaves = 0;
}
SoftmaxTree::~SoftmaxTree() {
checkCudaErrors(cudaFreeHost(_nodeSizes));
for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
int height = it->first;
SoftmaxNodeV& nodes = *it->second;
for (int n = 0; n < nodes.size(); n++) {
checkCudaErrors(cudaFreeHost(_nodeChildMeta[height][n]));
}
checkCudaErrors(cudaFreeHost(_nodeChildMeta[height]));
checkCudaErrors(cudaFreeHost(_nodeBwdMeta[height]));
delete &nodes;
}
for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
int depth = it->first;
checkCudaErrors(cudaFreeHost(_nodeFwdMeta[depth]));
delete &nodes;
}
delete _root;
}
void SoftmaxTree::setFwdMeta() {
for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
ushort2* meta;
checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
int depth = it->first;
_nodeFwdMeta[depth] = meta;
for (int n = 0; n < nodes.size(); n++) {
meta[n].x = nodes[n]->getLabel();
// Setting the root to have parent 0 is ok because the fwd kernel won't
// query this anyway when root == true.
meta[n].y = nodes[n]->getParent() == NULL ? 0 : nodes[n]->getParent()->getLabel();
}
}
}
void SoftmaxTree::setBwdMeta() {
for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
ushort2* meta;
ushort** childMeta;
checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
checkCudaErrors(cudaHostAlloc(&childMeta, sizeof(ushort*) * nodes.size(), cudaHostAllocPortable));
int height = it->first;
_nodeBwdMeta[height] = meta;
_nodeChildMeta[height] = childMeta;
for (int n = 0; n < nodes.size(); n++) {
checkCudaErrors(cudaHostAlloc(&childMeta[n], sizeof(ushort) * nodes[n]->getChildren().size(), cudaHostAllocPortable));
for (int c = 0; c < nodes[n]->getChildren().size(); c++) {
childMeta[n][c] = nodes[n]->getChildren()[c]->getLabel();
}
meta[n].x = nodes[n]->getLabel();
meta[n].y = nodes[n]->getChildren().size();
}
}
}
void SoftmaxTree::setDistances() {
_nodeHeights.clear();
_nodeDepths.clear();
_root->setDistances(_nodeHeights, _nodeDepths);
}
void SoftmaxTree::setNodeCounts() {
_numNodes = 0;
_numLeaves = 0;
_root->setNodeCounts(_numNodes, _numLeaves);
}
void SoftmaxTree::setNodeSizes() {
assert(_numLeaves > 0);
checkCudaErrors(cudaHostAlloc(&_nodeSizes, sizeof(ushort) * _numNodes, cudaHostAllocPortable));
_root->setSizes(_nodeSizes);
}
void SoftmaxTree::finalize() {
setDistances();
setNodeCounts();
setNodeSizes();
setFwdMeta();
setBwdMeta();
}
SoftmaxNode& SoftmaxTree::getRoot() {
return *_root;
}
SoftmaxNodeV& SoftmaxTree::getNodesAtHeight(int height) {
return *_nodeHeights[height];
}
SoftmaxNodeV& SoftmaxTree::getNodesAtDepth(int depth) {
return *_nodeDepths[depth];
}
int SoftmaxTree::getHeight() const {
return _root->getHeight();
}
/*
* A tree with only a root is taken to have depth 0.
*/
int SoftmaxTree::getDepth() const {
return _nodeDepths.size() - 1;
}
int SoftmaxTree::getNumLeaves() const {
return _numLeaves;
}
int SoftmaxTree::getNumNodes() const {
return _numNodes;
}
/*
* offsets: (numNodes, numFeatures)
* targets: (numNodes, numFeatures)
*/
void SoftmaxTree::makeWeights(NVMatrix& offsets, NVMatrix& targets) {
preprocess(offsets);
preprocess(targets);
assert(offsets.getNumRows() == _numNodes);
assert(targets.isSameDims(offsets));
int numFeatures = offsets.getNumCols();
dim3 threads = dim3(256); // 256 seems to work best on dummy binary tree
dim3 blocks = dim3(DIVUP(numFeatures, 256), 1); // Only the root is at depth 0
cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, true>, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, false>, cudaFuncCachePreferL1);
kSoftmaxTreeFwd<256, true><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[0], targets.getDevData(), numFeatures);
getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
for (int d = 1; d <= getDepth(); d++) {
blocks = dim3(DIVUP(numFeatures, 256), _nodeDepths[d]->size());
kSoftmaxTreeFwd<256, false><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[d], targets.getDevData(), numFeatures);
getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
}
postprocess(offsets);
postprocess(targets);
}
/*
* grads: (numNodes, numFeatures)
*
* The idea is that grads contains gradients for the leaves
* (i.e. the first numLabels rows), so this routine will
* distribute them up the tree.
*
*/
void SoftmaxTree::distributeGradients(NVMatrix& grads) {
preprocess(grads);
assert(grads.getNumRows() == _numNodes);
int numFeatures = grads.getNumCols();
// The leaves (nodes at height = 0) already have gradients computed.
// So start at the nodes at height = 1.
dim3 threads = dim3(512); // this block size works best :/
cudaFuncSetCacheConfig(kSoftmaxTreeBwd<512>, cudaFuncCachePreferL1);
for (int h = 1; h <= getHeight(); ++h) {
dim3 blocks = dim3(DIVUP(numFeatures, 512), _nodeHeights[h]->size());
kSoftmaxTreeBwd<512><<<blocks, threads>>>(grads.getDevData(), _nodeBwdMeta[h], _nodeChildMeta[h], numFeatures);
getLastCudaError("kSoftmaxTreeBwd: kernel execution failed");
}
postprocess(grads);
}
/*
* inc := mom * inc - wc * epsW * weight + epsW * grad
* weight := weight + inc
*
* weights: (numNodes, numFeatures)
* incs: (numNodes, numFeatures)
* grads: (numNodes , numFeatures)
*/
void SoftmaxTree::updateWeights(NVMatrix& weights, NVMatrix& incs, NVMatrix& grads, float epsWBase, float mom, float wcBase) {
preprocess(weights);
preprocess(incs);
preprocess(grads);
assert(grads.getNumRows() == _numNodes);
assert(grads.isSameDims(incs));
assert(grads.isSameDims(weights));
int numFeatures = grads.getNumCols();
dim3 threads = dim3(512);
dim3 blocks = dim3(_numNodes);
cudaFuncSetCacheConfig(kSoftmaxTreeUpdateWeights<512>, cudaFuncCachePreferL1);
kSoftmaxTreeUpdateWeights<512><<<blocks, threads>>>(weights.getDevData(), incs.getDevData(), grads.getDevData(),
_nodeSizes, numFeatures, epsWBase, mom, wcBase);
getLastCudaError("kSoftmaxTreeUpdateWeights: kernel execution failed");
weights.transpose();
incs.transpose();
grads.transpose();
}
void SoftmaxTree::preprocess(NVMatrix& inp) {
inp.transpose();
assert(!inp.isTrans());
assert(inp.isContiguous());
}
void SoftmaxTree::postprocess(NVMatrix& inp) {
inp.transpose();
}

378
src/test.cu Normal file
View file

@ -0,0 +1,378 @@
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <set>
#include <test.cuh>
#include <layer_kernels.cuh>
#include <multisoftmax.h>
#include <cpuCNN.cuh>
static StopWatchInterface *timer = NULL;
using namespace std;
void init_tests(int boardNum) {
cudaSetDevice(boardNum > -1 ? boardNum : 0);
// cublasInit();
NVMatrix::initCublas();
NVMatrix::initRandom(7);
sdkCreateTimer(&timer);
}
void compareResults(Matrix& cpu, NVMatrix& gpu, const char* matrixName) {
Matrix gpuOnCPU(cpu);
gpu.copyToHost(gpuOnCPU);
gpuOnCPU.subtract(cpu);
gpuOnCPU.apply(Matrix::ABS);
printf("Max diff between CPU/GPU matrices %s: %.6f\n", matrixName, gpuOnCPU.max());
}
void test_blattice() {
printf("===============================\n");
printf("test_blattice\n");
printf("===============================\n");
int numCases = 2;
int numOut = 32;
int setSize = 3;
cout << "numCases: " << numCases << endl;
cout << "numOut: " << numOut << endl;
cout << "setSize: " << setSize << endl;
NVMatrix nvEnergies(numCases, numOut);
Matrix energies(numCases, numOut);
Matrix bLattice(numOut, numCases * setSize);
nvEnergies.randomizeUniform();
nvEnergies.copyToHost(energies);
//energies.randomizeUniform();
bLattice.apply(Matrix::ZERO); // for now
Matrix &enMax = energies.max(1);
energies.addVector(enMax, -1);
nvEnergies.copyFromHost(energies);
NVMatrix nvBLattice(bLattice, true);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
MSMBackward(nvEnergies, nvBLattice, setSize);
cudaThreadSynchronize();
sdkStopTimer(&timer);
printf("Energies: \n");
nvEnergies.print(10, 5);
printf("GPU (partial) result:\n");
nvBLattice.print(0, 5, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}
//void test_multiSoftmaxCPU() {
// printf("===============================\n");
// printf("test_multiSoftmaxCPU\n");
// printf("===============================\n");
//
// int numCases = 2;
// int numOut = 5;
// int setSize = 3;
//
//// int numCases = 128;
//// int numOut = 1000;
//// int setSize = 5;
//
// cout << "numCases: " << numCases << endl;
// cout << "numOut: " << numOut << endl;
// cout << "setSize: " << setSize << endl;
//
// Matrix energies(numCases, numOut);
// Matrix B(numOut + 1, setSize + 1);
// Matrix probs(energies);
// energies.randomizeUniform();
// probs.apply(Matrix::ZERO); // for now
//
// Matrix &enMax = energies.max(1);
// energies.addVector(enMax, -1);
// B.apply(Matrix::ZERO);
//
// sdkResetTimer(&timer);
// sdkStartTimer(&timer);
//
// MultiSoftmaxCPU_T(energies, B, probs, setSize, -1);
//
// cudaThreadSynchronize();
// sdkStopTimer(&timer);
//
// printf("Energies: \n");
// energies.print(10, 5);
//
// printf("CPU (partial) result:\n");
// probs.print(0, 5, 0, 5);
// printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
//}
void test_multiSoftmaxCPU_parallel() {
printf("===============================\n");
printf("test_multiSoftmaxCPU_parallel\n");
printf("===============================\n");
int workers = 8;
int numCases = 2;
int numOut = 5;
int setSize = 2;
// int numCases = 128;
// int numOut = 1000;
// int setSize = 5;
cout << "workers: " << workers << endl;
cout << "numCases: " << numCases << endl;
cout << "numOut: " << numOut << endl;
cout << "setSize: " << setSize << endl;
NVMatrix nvEnergies(numCases, numOut);
Matrix energies(numCases, numOut);
vector<Matrix*> B;
Matrix probs(energies);
Matrix fixed(numCases, 1);
nvEnergies.randomizeUniform();
nvEnergies.copyToHost(energies);
//energies.randomizeUniform();
probs.apply(Matrix::ZERO); // for now
Matrix &enMax = energies.max(1);
energies.addVector(enMax, -1);
fixed.apply(Matrix::ONE);
fixed.scale(2);
for (int i = 0; i < workers; i++) {
B.push_back(new Matrix(numOut + 1, setSize + 1));
B[i]->apply(Matrix::ONE);
B[i]->scale(-INF);
}
sdkResetTimer(&timer);
sdkStartTimer(&timer);
MultiSoftmaxCPU_T_parallel(energies, B, probs, fixed, setSize, true);
cudaThreadSynchronize();
sdkStopTimer(&timer);
printf("Energies: \n");
energies.print(10, 10);
printf("CPU (partial) result:\n");
probs.print(0, 5, 0, 10);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}
SoftmaxTree* makeDummyTree(int depth) {
int numNodes = (1 << (depth + 1)) - 1;
int numLeaves = (numNodes + 1) / 2;
int idx = numNodes - 1;
SoftmaxTree* tree = new SoftmaxTree(idx--);
vector<SoftmaxNode*> prevLevel;
prevLevel.push_back(&tree->getRoot());
while (idx >= 0) {
int sz = prevLevel.size();
for (int i = 0; i < sz; i++) {
SoftmaxNode& node = *prevLevel[0];
SoftmaxNode& child1 = node.addChild(idx--);
SoftmaxNode& child2 = node.addChild(idx--);
prevLevel.push_back(&child1);
prevLevel.push_back(&child2);
prevLevel.erase(prevLevel.begin());
}
}
tree->finalize();
assert(tree->getNumLeaves() == numLeaves);
assert(tree->getNumNodes() == numNodes);
return tree;
}
void test_sftree_fwd() {
printf("===============================\n");
printf("test_sftree_fwd\n");
printf("===============================\n");
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix weights(tree->getNumNodes(), numFeatures);
Matrix targets(tree->getNumNodes(), numFeatures);
NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
NVMatrix nvTargets(tree->getNumNodes(), numFeatures);
weights.randomizeUniform();
nvWeights.copyFromHost(weights);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeFwd(weights.getData(), targets.getData(), numFeatures, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
targets.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvWeights.transpose();
nvTargets.transpose();
sdkStartTimer(&timer);
tree->makeWeights(nvWeights, nvTargets);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvWeights.transpose();
nvTargets.transpose();
printf("GPU (partial) result:\n");
nvTargets.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(targets, nvTargets, "targets");
}
void test_sftree_bwd() {
printf("===============================\n");
printf("test_sftree_bwd\n");
printf("===============================\n");
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix grads(tree->getNumNodes(), numFeatures);
NVMatrix nvGrads(tree->getNumNodes(), numFeatures);
grads.randomizeUniform();
nvGrads.copyFromHost(grads);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeBwd(grads.getData(), numFeatures, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
grads.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvGrads.transpose();
sdkStartTimer(&timer);
tree->distributeGradients(nvGrads);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvGrads.transpose();
printf("GPU (partial) result:\n");
nvGrads.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(grads, nvGrads, "grads");
}
void test_sftree_update() {
printf("===============================\n");
printf("test_sftree_update\n");
printf("===============================\n");
float eps = 0.001, wc = 0.005, mom = 0.9;
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix grads(tree->getNumNodes(), numFeatures);
Matrix weights(tree->getNumNodes(), numFeatures);
Matrix incs(tree->getNumNodes(), numFeatures);
NVMatrix nvGrads(tree->getNumNodes(), numFeatures);
NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
NVMatrix nvIncs(tree->getNumNodes(), numFeatures);
grads.randomizeUniform();
weights.randomizeUniform();
incs.randomizeUniform();
nvGrads.copyFromHost(grads);
nvWeights.copyFromHost(weights);
nvIncs.copyFromHost(incs);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeUpdateWeights(weights.getData(), incs.getData(), grads.getData(), numFeatures, eps, mom, wc, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
weights.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvGrads.transpose();
nvWeights.transpose();
nvIncs.transpose();
sdkStartTimer(&timer);
tree->updateWeights(nvWeights, nvIncs, nvGrads, eps, mom, wc);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvGrads.transpose();
nvWeights.transpose();
nvIncs.transpose();
printf("GPU (partial) result:\n");
nvWeights.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(weights, nvWeights, "weights");
compareResults(incs, nvIncs, "incs");
}
int main(int argc, char** argv) {
int boardNum = get_board_lock();
if (boardNum == GPU_LOCK_NO_BOARD) {
printf("No free GPU boards!\n");
exit(EXIT_FAILURE);
} else if(boardNum == GPU_LOCK_NO_SCRIPT) {
printf("Running on default board.\n");
} else {
printf("Running on board %d\n", boardNum);
}
init_tests(boardNum);
// test_blattice();
// test_multiSoftmaxCPU();
// test_multiSoftmaxCPU_parallel();
// test_sftree_fwd();
// test_sftree_bwd();
// test_mdiag();
// test_mdiagGrad();
return 0;
}

124
src/util.cu Normal file
View file

@ -0,0 +1,124 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <util.cuh>
using namespace std;
stringv* getStringV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
stringv* vec = new stringv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i))));
}
return vec;
}
floatv* getFloatV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
floatv* vec = new floatv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i)));
}
return vec;
}
intv* getIntV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
intv* vec = new intv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)));
}
return vec;
}
int* getIntA(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
int* arr = new int[PyList_GET_SIZE(pyList)];
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i));
}
return arr;
}
MatrixV* getMatrixV(PyObject* pyList) {
return pyList == NULL ? NULL : getMatrixV(pyList, PyList_GET_SIZE(pyList));
}
MatrixV* getMatrixV(PyObject* pyList, int len) {
if (pyList == NULL) {
return NULL;
}
MatrixV* vec = new MatrixV();
for (int i = 0; i < len; i++) {
vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i)));
}
return vec;
}
int pyDictGetInt(PyObject* dict, const char* key) {
return PyInt_AS_LONG(PyDict_GetItemString(dict, key));
}
intv* pyDictGetIntV(PyObject* dict, const char* key) {
return getIntV(PyDict_GetItemString(dict, key));
}
int* pyDictGetIntA(PyObject* dict, const char* key) {
return getIntA(PyDict_GetItemString(dict, key));
}
string pyDictGetString(PyObject* dict, const char* key) {
return string(PyString_AS_STRING(PyDict_GetItemString(dict, key)));
}
float pyDictGetFloat(PyObject* dict, const char* key) {
return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key));
}
floatv* pyDictGetFloatV(PyObject* dict, const char* key) {
return getFloatV(PyDict_GetItemString(dict, key));
}
Matrix* pyDictGetMatrix(PyObject* dict, const char* key) {
return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key));
}
MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) {
return getMatrixV(PyDict_GetItemString(dict, key));
}
stringv* pyDictGetStringV(PyObject* dict, const char* key) {
return getStringV(PyDict_GetItemString(dict, key));
}

378
src/weights.cu Normal file
View file

@ -0,0 +1,378 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <weights.cuh>
#include <softmaxtree.cuh>
#include <lr.cuh>
#include "worker.cuh"
NVMatrix& Weights::operator*() const {
return getW();
}
Weights::Weights(Weights& srcWeights, LearningRateSchedule& lrs)
: _srcWeights(&srcWeights), _lrs(&lrs), _wc(0), _wball(0), _onGPU(false), _numUpdates(0),
_weights(NULL), _weightsInc(NULL), _weightsGrad(NULL), _cleanup(false) {
_hWeights = &srcWeights.getCPUW();
_hWeightsInc = &srcWeights.getCPUWInc();
_mom = srcWeights.getMom();
_useGrad = srcWeights.isUseGrad();
_superEps = srcWeights.getSuperEps();
}
Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wc,
float wball, float mom, float superEps, bool useGrad, bool cleanup)
: _srcWeights(NULL), _hWeights(&hWeights), _hWeightsInc(&hWeightsInc), _numUpdates(0),
_lrs(&lrs), _wc(wc), _wball(wball), _mom(mom), _useGrad(useGrad), _superEps(superEps),
_onGPU(false), _weights(NULL),_weightsInc(NULL), _weightsGrad(NULL), _cleanup(cleanup) {
assert(_superEps <= 0 || _useGrad); // superWeights ==> useGrad
}
Weights::~Weights() {
delete _lrs;
if (_cleanup) {
delete _hWeights;
delete _hWeightsInc;
if (_srcWeights == NULL) {
delete _weights;
delete _weightsInc;
delete _weightsGrad;
}
}
}
NVMatrix& Weights::getW() const {
assert(_onGPU);
return *_weights;
}
NVMatrix& Weights::getInc() const {
assert(_onGPU);
return *_weightsInc;
}
NVMatrix& Weights::getGrad() const {
assert(_onGPU);
return _useGrad ? *_weightsGrad : *_weightsInc;
}
Matrix& Weights::getCPUW() const {
return *_hWeights;
}
Matrix& Weights::getCPUWInc() const {
return *_hWeightsInc;
}
int Weights::getNumRows() const {
return _hWeights->getNumRows();
}
int Weights::getNumCols() const {
return _hWeights->getNumCols();
}
void Weights::copyToCPU() {
if (_srcWeights == NULL) {
assert(_onGPU);
_weights->copyToHost(*_hWeights);
_weightsInc->copyToHost(*_hWeightsInc);
}
}
// This function is assumed to be called in the order in which the layers
// were defined
void Weights::copyToGPU() {
assert(!_onGPU);
if (_srcWeights == NULL) {
_weights = _weights == NULL ? new NVMatrix() : _weights;
_weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc;
_weights->copyFromHost(*_hWeights, true);
_weightsInc->copyFromHost(*_hWeightsInc, true);
_weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL;
_weightsGradAvg = _superEps > 0 ? new NVMatrix() : NULL;
_weightsGrad2Avg = _superEps > 0 ? new NVMatrix() : NULL;
} else {
_weights = _srcWeights->_weights;
_weightsInc = _srcWeights->_weightsInc;
_weightsGrad = _srcWeights->_weightsGrad;
}
_onGPU = true;
}
#define SUPERMOM_THREADS 256
#define SUPERMOM_BLOCKS_MAX 4096
/*
* V = eps * g / (G2 - G^2 + superEps)^.5 + mom * V
*/
__global__ void superMomUpdate(float* V, float* g, float* G, float* G2,
const float eps, const float mom, const float superEps, const int numElements) {
const int tidx = blockIdx.x * SUPERMOM_THREADS + threadIdx.x;
for (int t = tidx; t < numElements; t += gridDim.x * SUPERMOM_THREADS) {
V[t] = mom * V[t] + eps * __fdividef(g[t], sqrtf(G2[t] - G[t] * G[t] + superEps));
}
}
// When _useGrad is false, weightsInc is assumed to contain the
// entire, properly scaled weight increment.
// OTHERWISE, scale your gradient by 1 / numCases only.
// The scaling by epsW will be done in this routine.
void Weights::update(float progress) {
// Only true owner of weights updates
if (_srcWeights == NULL && _lrs->getBaseRate() > 0) {
assert(_onGPU);
if (_superEps <= 0) {
if (_useGrad) {
_weightsInc->add(*_weightsGrad, _mom, _lrs->getRate(progress));
}
} else {
if (!_weightsGradAvg->isSameDims(*_weightsGrad)) {
_weightsGradAvg->resize(*_weightsGrad);
_weightsGrad2Avg->resize(*_weightsGrad);
_weightsGradAvg->apply(NVMatrixOps::Zero());
_weightsGrad2Avg->apply(NVMatrixOps::Zero());
}
_weightsGradAvg->add(*_weightsGrad, _mom, 1 - _mom);
_weightsGrad2Avg->applyBinary(Grad2AvgOperator(_mom), *_weightsGrad);
// Geoff version
// Make sure all matrices are contiguous
assert(_weightsGrad->isContiguous());
assert(_weightsGradAvg->isContiguous());
assert(_weightsGrad2Avg->isContiguous());
assert(_weightsInc->isContiguous());
// Make sure they all have the same transposedness
assert(_weightsGrad->isTrans() == _weightsGradAvg->isTrans());
assert(_weightsGradAvg->isTrans() == _weightsGrad2Avg->isTrans());
assert(_weightsGrad2Avg->isTrans() == _weightsInc->isTrans());
// Make sure they all have the same sizes
assert(_weightsGrad->isSameDims(*_weightsGradAvg));
assert(_weightsGradAvg->isSameDims(*_weightsGrad2Avg));
assert(_weightsGrad2Avg->isSameDims(*_weightsInc));
int numElements = _weights->getNumElements();
dim3 blocks(std::min(DIVUP(numElements, SUPERMOM_THREADS), SUPERMOM_BLOCKS_MAX));
dim3 threads(SUPERMOM_THREADS);
//float super = _superEps + 1000000*_weightsGrad2Avg->sum() / numElements;
//printf("super: %f\n", super);
superMomUpdate<<<blocks, threads>>>(_weightsInc->getDevData(), _weightsGrad->getDevData(),
_weightsGradAvg->getDevData(), _weightsGrad2Avg->getDevData(),
_lrs->getRate(progress), _mom, _superEps, numElements);
getLastCudaError("superMomUpdate: Kernel execution failed");
//_weightsInc->print(4,4);
//_weightsGrad2Avg->print(5,5);exit(0);
// Ilya version
}
if (_wc > 0) {
_weightsInc->add(*_weights, -_wc * _lrs->getRate(progress));
}
_weights->add(*_weightsInc);
_numUpdates = 0;
}
}
int Weights::incNumUpdates() {
if (_srcWeights != NULL) {
return _srcWeights->incNumUpdates();
}
return _numUpdates++;
}
// Returns the number of times a gradient has been computed for this
// weight matrix during the current pass (interval between two calls of update())
// through the net. This number will only be greater than 1 if this weight matrix
// is *shared* by multiple layers in the net.
int Weights::getNumUpdates() const {
if (_srcWeights != NULL) {
return _srcWeights->getNumUpdates();
}
return _numUpdates;
}
float Weights::getEps(float progress) const {
return _lrs->getRate(progress);
}
float Weights::getMom() const {
return _mom;
}
float Weights::getWC() const {
return _wc;
}
float Weights::getWBall() const {
return _wball;
}
bool Weights::isUseGrad() const { // is good grammar
return _useGrad;
}
bool Weights::isOwner() const {
return _srcWeights == NULL;
}
float Weights::getSuperEps() const {
return _superEps;
}
LearningRateSchedule& Weights::getLearningRateSchedule() const {
return *_lrs;
}
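/*
* Typical lifecycle (sketch): construct from host matrices, call copyToGPU() once on
* the owning GPU thread, accumulate gradients into getGrad() each minibatch (calling
* incNumUpdates() per contribution), then call update(progress) to apply momentum and
* weight decay and reset the counter; copyToCPU() pulls weights back for saving.
* Instances constructed from another Weights object share the owner's GPU matrices
* and leave the actual update to the owner.
*/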
/*
* ===============
* TreeWeights
* ===============
*/
TreeWeights::TreeWeights(SoftmaxTree& tree, Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wcBase, float mom)
: _tree(&tree), Weights(hWeights, hWeightsInc, lrs, wcBase, 0, mom, 0, true) {
assert(hWeights.isTrans());
assert(hWeightsInc.isTrans());
}
NVMatrix& TreeWeights::getW() const {
return *_leafWeights;
}
NVMatrix& TreeWeights::getInc() const {
return *_leafInc;
}
NVMatrix& TreeWeights::getGrad() const {
return *_leafGrad;
}
NVMatrix& TreeWeights::getAllW() const {
return *_weights;
}
NVMatrix& TreeWeights::getAllInc() const {
return *_weightsInc;
}
NVMatrix& TreeWeights::getAllGrad() const {
return *_weightsGrad;
}
void TreeWeights::copyToGPU() {
assert(!_onGPU);
Weights::copyToGPU();
_tree->finalize();
_effWeights.resize(*_weights);
_leafWeights = &_effWeights.sliceCols(0, _tree->getNumLeaves());
_leafGrad = &_weightsGrad->sliceCols(0, _tree->getNumLeaves());
_leafInc = &_weightsInc->sliceCols(0, _tree->getNumLeaves());
assert(_leafWeights->isView());
makeWeights();
}
int TreeWeights::getNumRows() const {
return _tree->getNumNodes();
}
void TreeWeights::update(float progress) {
// Only true owner of weights updates
if (_lrs->getBaseRate() > 0) {
assert(_onGPU);
distributeGradients();
_tree->updateWeights(*_weights, *_weightsInc, *_weightsGrad, _lrs->getRate(progress), _mom, _wc);
makeWeights();
_numUpdates = 0;
}
}
void TreeWeights::makeWeights() {
_tree->makeWeights(*_weights, _effWeights);
}
void TreeWeights::distributeGradients() {
_tree->distributeGradients(*_weightsGrad);
}
/*
* ===============
* DummyWeights
* ===============
*/
DummyWeights::DummyWeights(Matrix& hWeights, Matrix& hWeightsInc,
NVMatrix& weights, NVMatrix& incs, NVMatrix& grads)
: Weights(hWeights, hWeightsInc, *new LearningRateSchedule(0), 0, 0, 0, 0, true, false) {
_onGPU = true;
_weights = &weights;
_weightsInc = &incs;
_weightsGrad = &grads;
}
/*
* ===============
* WeightList
* ===============
*/
Weights& WeightList::operator[](const int idx) const {
return *_weightList[idx];
}
WeightList::~WeightList() {
for (int i = 0; i < _weightList.size(); i++) {
delete _weightList[i];
}
}
WeightList::WeightList() {
}
void WeightList::addWeights(Weights& w) {
_weightList.push_back(&w);
}
void WeightList::update(float progress) {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->update(progress);
}
}
void WeightList::copyToCPU() {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->copyToCPU();
}
}
void WeightList::copyToGPU() {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->copyToGPU();
}
}
int WeightList::getSize() const {
return _weightList.size();
}

279
src/worker.cu Normal file
View file

@ -0,0 +1,279 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <util.cuh>
#include <worker.cuh>
using namespace std;
/*
* ====================
* WorkResult
* ====================
*/
WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) {
}
WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) {
}
WorkResult::~WorkResult() {
delete _results; // delete NULL is ok
}
Cost& WorkResult::getResults() const {
return *_results;
}
WorkResult::RESULTS WorkResult::getResultType() const {
return _resultType;
}
/*
* ====================
* Worker
* ====================
*/
Worker::Worker(ConvNet& convNet) : _convNet(&convNet) {
}
/*
* ====================
* DataWorker
* ====================
*/
DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data) {
_dp = &convNet.getDataProvider();
_dp->setData(*_data);
}
DataWorker::~DataWorker() {
_dp->clearData();
}
/*
* ====================
* TrainingWorker
* ====================
*/
TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test)
: DataWorker(convNet, data), _progress(progress), _test(test) {
}
// Need to setData here (as opposed to the constructor) because the constructor executes in
// the original CPU thread, which is not the one with GPU access.
void TrainingWorker::run() {
_convNet->setTrainingProgress(_progress);
Cost& batchCost = *new Cost(0);
for (int i = 0; i < _dp->getNumMinibatches(); i++) {
_convNet->fprop(i, _test ? PASS_TEST : PASS_TRAIN);
_convNet->getCost(batchCost);
if (!_test) {
_convNet->bprop(PASS_TRAIN);
_convNet->updateWeights();
}
}
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* SyncWorker
* ====================
*/
SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) {
}
void SyncWorker::run() {
_convNet->copyToCPU();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE));
}
/*
* ====================
* GradCheckWorker
* ====================
*/
GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data)
: DataWorker(convNet, data) {
}
void GradCheckWorker::run() {
_convNet->checkGradients();
exit(0);
}
/*
* ====================
* MultiviewTestWorker
* ====================
*/
MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName)
: DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) {
assert(_data->getNumCases() % _numViews == 0);
}
MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews)
: DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") {
assert(_data->getNumCases() % _numViews == 0);
}
MultiviewTestWorker::~MultiviewTestWorker() {
delete _cpuProbs;
}
void MultiviewTestWorker::run() {
int numCasesReal = _dp->getNumCases() / _numViews;
int numMiniReal = DIVUP(numCasesReal, _dp->getMinibatchSize());
Cost& batchCost = *new Cost(0);
for (int i = 0; i < numMiniReal; i++) {
for (int v = 0; v < _numViews; v++) {
CPUData& mini = _dp->getDataSlice(v * numCasesReal + i * _dp->getMinibatchSize(),
min((v + 1) * numCasesReal, v * numCasesReal + (i + 1) * _dp->getMinibatchSize()));
_convNet->fprop(mini, v == 0 ? PASS_MULTIVIEW_TEST_START : v == _numViews - 1 ? PASS_MULTIVIEW_TEST_END : PASS_MULTIVIEW_TEST);
}
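// If a host buffer was supplied, copy the view-accumulated probabilities for this
// minibatch from the logreg layer back to the host.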
if (_cpuProbs != NULL) {
LogregCostLayer& logregLayer = *dynamic_cast<LogregCostLayer*>(&_convNet->getLayer(_logregName));
cudaSetDevice(logregLayer.getDeviceID());
Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(),
min(numCasesReal, (i + 1) * _dp->getMinibatchSize()));
NVMatrix& acts = logregLayer.getProbsAccum();
NVMatrix acts_T;
acts.transpose(acts_T);
acts_T.copyToHost(miniProbs);
delete &miniProbs;
}
_convNet->getCost(batchCost);
}
cudaDeviceSynchronize();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* FeatureWorker
* ====================
*/
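// Extracts the activations of the named layers for every data case and writes them
// into the supplied host matrices.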
FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames)
: DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames) {
assert(layerNames.size() == ftrs.size());
for (int i = 0; i < layerNames.size(); i++) {
assert(ftrs[i]->getNumRows() == data.getNumCases());
assert(!ftrs[i]->isTrans());
}
}
FeatureWorker::~FeatureWorker() {
for (int i = 0; i < _ftrs->size(); i++) {
delete _ftrs->at(i);
}
delete _ftrs;
delete _layerNames;
}
void FeatureWorker::run() {
Cost& batchCost = *new Cost(0);
for (int i = 0; i < _dp->getNumMinibatches(); i++) {
_convNet->fprop(i, PASS_FEATURE_GEN);
_convNet->getCost(batchCost);
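// Copy each requested layer's activations for this minibatch into the corresponding
// rows of its host feature matrix.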
for (int f = 0; f < _layerNames->size(); f++) {
Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f));
int d = ftrLayer.getDeviceID();
cudaSetDevice(d);
Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(),
min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
NVMatrix& acts = ftrLayer.getActs();
NVMatrix acts_T;
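// The host feature matrices are row-major with one row per case, so bring the GPU
// activations into that orientation before copying.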
if (acts.isTrans()) {
NVMatrix& actsTrans = acts.getTranspose();
actsTrans.transpose(acts_T);
delete &actsTrans;
} else {
acts.transpose(acts_T);
}
acts_T.copyToHost(miniFtrs);
delete &miniFtrs;
}
}
cudaDeviceSynchronize();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* DataGradWorker
* ====================
*/
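// Intended to compute the gradient of the cost with respect to the input data;
// the implementation below is commented out, so run() currently does nothing.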
DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx)
: DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) {
assert(dataGrads.getNumRows() == data.getNumCases());
assert(!dataGrads.isTrans());
}
DataGradWorker::~DataGradWorker() {
delete _dataGrads;
}
void DataGradWorker::run() {
// DataLayer& dataLayer = *dynamic_cast<DataLayer*>(&_convNet->getLayer(_dataLayerIdx));
// SoftmaxLayer& softmaxLayer = *dynamic_cast<SoftmaxLayer*>(&_convNet->getLayer(_softmaxLayerIdx));
// softmaxLayer.setDoLogregGrad(false);
// Cost& batchCost = *new Cost(0);
// for (int i = 0; i < _dp->getNumMinibatches(); i++) {
// _convNet->fprop(i, PASS_TEST);
// _convNet->getCost(batchCost);
// softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad());
//
// softmaxLayer.getActsGrad().addScalar(1);
// softmaxLayer.getActsGrad().scale(-1);
// softmaxLayer.incRcvdBInputs();
// softmaxLayer.bprop(PASS_TEST);
//
// Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(),
// min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
// NVMatrix& grads = dataLayer.getActsGrad();
// NVMatrix grads_T;
// if (grads.isTrans()) {
// NVMatrix& soft_T = grads.getTranspose();
// soft_T.transpose(grads_T);
// delete &soft_T;
// } else {
// grads.transpose(grads_T);
// }
// grads_T.copyToHost(miniDataGrads);
// delete &miniDataGrads;
//
// _convNet->reset();
// }
// cudaThreadSynchronize();
// _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}