Initial commit

This commit is contained in:
Laurent El Shafey 2024-12-10 08:56:11 -08:00
commit 9fdd561586
246 changed files with 58283 additions and 0 deletions

594
src/convnet.cu Normal file

@@ -0,0 +1,594 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <vector>
#include <iostream>
#include <string>
#include <set>
#include <map>
#include <nvmatrix.cuh>
#include <nvmatrix_operators.cuh>
#include <matrix.h>
#include <convnet.cuh>
#include <util.cuh>
using namespace std;
/*
* =======================
* ConvNet
* =======================
*/
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, vector<intv*>& deviceCPUs, int minibatchSize, int weightUpdateFreq) : Thread(false) {
_weightUpdateFreq = weightUpdateFreq;
_numBwdMiniPasses = 0;
_deviceIDs = &deviceIDs;
_deviceCPUs = &deviceCPUs;
_data = NULL;
_trainingProgress = 0;
_sync = new ThreadSynchronizer(deviceIDs.size() + 1);
seti pipeSet;
pipeSet.insert(deviceIDs.begin(), deviceIDs.end());
_pd = new PipeDispenserNonBlocking(pipeSet);
PyObject* layerList = PyDict_Values(layerParams);
// Data layers live on the manager thread (in CPU memory)
for (int i = 0; i < PyList_GET_SIZE(layerList); i++) {
PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
string layerType = pyDictGetString(paramsDict, "type");
if (layerType == "data") {
DataLayer* d = new DataLayer(NULL, paramsDict);
_dataLayers.push_back(d);
_layerMap[d->getName()] = d;
}
}
// Initialize GPU worker threads
for (int d = 0; d < deviceIDs.size(); ++d) {
ConvNetGPU* cng = new ConvNetGPU(layerList, deviceIDs[d], *deviceCPUs[d], this);
_convNetThreads.push_back(cng);
for (map<string, Layer*>::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
_layerMap[it->first] = it->second;
}
}
// Connect forward/backward links in graph
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
if (inputList != NULL) {
for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
string inputName = PyString_AsString(PyList_GetItem(inputList, i));
it->second->addPrev(_layerMap[inputName]);
_layerMap[inputName]->addNext(it->second);
}
}
}
_numFwdTerminal = 0;
// Execute post-initialization stuff
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->postInit();
_numFwdTerminal += it->second->getNext().size() == 0; // Number of terminal nodes going forward
}
// Find and count the terminal nodes in the backward pass
set<string> visited, terminal;
for (int t = 0; t < _convNetThreads.size(); t++) {
vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
for (int c = 0; c < cl.size(); c++) {
findBwdTerminal(*cl[c], visited, terminal);
}
}
_numBwdTerminal = terminal.size();
// printf("num fwd terminals: %d, back terminals:\n", _numFwdTerminal);
// for (set<string>::iterator it = terminal.begin(); it != terminal.end(); ++it) {
// printf("%s\n", (*it).c_str());
// }
_dp = new DataProvider(minibatchSize);
Py_DECREF(layerList);
assert(_weightUpdateFreq > 0);
}
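// Recursively walks the graph backward from a cost layer and records the "terminal" layers of
// the backward pass: gradient consumers that either do not produce a gradient themselves or
// none of whose inputs consume gradients. The number of such terminals is how many
// BPROP_TERMINAL messages bprop() will wait for.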
void ConvNet::findBwdTerminal(Layer& l, set<string>& visited, set<string> &terminal) {
if (visited.count(l.getName()) == 0) {
visited.insert(l.getName());
if (l.isGradConsumer()) {
bool hasPrevConsumer = false;
for (int i = 0; i < l.getPrev().size(); i++) {
hasPrevConsumer |= l.getPrev()[i]->isGradConsumer();
}
if (!hasPrevConsumer || !l.isGradProducer()) {
terminal.insert(l.getName());
l.setBwdTerminal(true);
} else if (l.isGradProducer()) {
for (int i = 0; i < l.getPrev().size(); i++) {
if (l.getPrev()[i]->isGradConsumer()) {
findBwdTerminal(*l.getPrev()[i], visited, terminal);
}
}
}
}
}
}
void* ConvNet::run() {
// The manager thread defaults to using the GPU of the first worker.
// Put more logic here if this is inappropriate.
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
for (int t = 0; t < _convNetThreads.size(); t++) {
_convNetThreads[t]->start();
}
copyToGPU();
while (true) {
Worker* worker = _workerQueue.dequeue();
worker->run();
delete worker;
}
return NULL;
}
Queue<Worker*>& ConvNet::getWorkerQueue() {
return _workerQueue;
}
Queue<WorkResult*>& ConvNet::getResultQueue() {
return _resultQueue;
}
DataProvider& ConvNet::getDataProvider() {
return *_dp;
}
Layer& ConvNet::operator[](string& name) {
return *_layerMap[name];
}
Layer& ConvNet::getLayer(string& name) {
return *_layerMap[name];
}
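// Broadcasts a message to every GPU worker thread. With sync == true, a SYNC message is also
// enqueued for each worker and the caller blocks on the shared ThreadSynchronizer until every
// worker has processed its SYNC (see the SYNC case in ConvNetGPU::run).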
void ConvNet::sendMessage(MESSAGES msg, bool sync) {
for (int i = 0; i < _convNetThreads.size(); i++) {
_convNetThreads[i]->enqueueMessage(new Message(msg));
if (sync) {
_convNetThreads[i]->enqueueMessage(new Message(SYNC));
}
}
if (sync) {
_sync->sync();
}
}
void ConvNet::copyToCPU() {
sendMessage(COPY_TO_CPU, true);
}
void ConvNet::copyToGPU() {
sendMessage(COPY_TO_GPU, false);
}
void ConvNet::updateWeights() {
sendMessage(UPDATE_WEIGHTS, true);
}
void ConvNet::reset() {
sendMessage(RESET, false);
}
void ConvNet::fprop(PASS_TYPE passType) {
assert(_data != NULL);
reset();
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFprop(*_data, passType);
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
void ConvNet::fprop(CPUData& data, PASS_TYPE passType) {
if (&data != _data) {
delete _data;
}
_data = &data;
fprop(passType);
}
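// Forward pass over minibatch miniIdx. If this minibatch was already handed to the data layers
// as a buffer by the previous call, fprop restarts from that buffer; otherwise the minibatch is
// fetched from the DataProvider. The next minibatch (if any) is then installed as the buffer,
// presumably so that its transfer can overlap with computation on the current one.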
void ConvNet::fprop(int miniIdx, PASS_TYPE passType) {
delete _data;
reset();
if (miniIdx == 0 || miniIdx != _dataLayers[0]->getBufferMinibatchIdx()) {
_data = &_dp->getMinibatch(miniIdx);
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFprop(*_data, passType);
}
} else {
_data = _dataLayers[0]->getBufferData();
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->startFpropFromBuffer(passType);
}
}
CPUData* nextData = miniIdx + 1 == _dp->getNumMinibatches() ? NULL : &_dp->getMinibatch(miniIdx + 1);
if (nextData != NULL) {
for (int i = 0; i < _dataLayers.size(); i++) {
_dataLayers[i]->setBuffer(*nextData, miniIdx + 1);
}
}
waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}
void ConvNet::bprop(PASS_TYPE passType) {
// Weights are updated when this is zero
_numBwdMiniPasses = (_numBwdMiniPasses + 1) % _weightUpdateFreq;
for (int i = 0; i < _convNetThreads.size(); i++) {
_convNetThreads[i]->enqueueMessage(new BpropStartMessage(passType));
}
waitForTerminals(_numBwdTerminal, BPROP_TERMINAL);
reset();
}
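// Blocks until numMsgs messages of the given terminal type (FPROP_TERMINAL or BPROP_TERMINAL)
// have arrived on the manager's message queue, one per terminal layer.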
void ConvNet::waitForTerminals(int numMsgs, MESSAGES msg) {
int terminalsDone = 0;
while(terminalsDone++ < numMsgs) {
Message* m = _msgQueue.dequeue();
assert(m->getMessageType() == msg);
delete m;
}
}
// Same as getCost() but adds results to given cost and returns it
Cost& ConvNet::getCost(Cost& cost) {
Cost &tmp = getCost();
cost += tmp;
delete &tmp;
return cost;
}
Cost& ConvNet::getCost() {
Cost &tmp = *new Cost(_data->getNumCases());
for (int i = 0; i < _convNetThreads.size(); i++) {
Cost& tmp2 = _convNetThreads[i]->getCost(_data->getNumCases());
tmp |= tmp2;
delete &tmp2;
}
return tmp;
}
double ConvNet::getCostValue() {
Cost& cost = getCost();
double val = cost.getValue();
delete &cost;
return val;
}
Queue<Message*>& ConvNet::getMessageQueue() {
return _msgQueue;
}
int ConvNet::getDeviceID(int gpuIdx) {
if (gpuIdx < 0) {
return -1;
}
return _deviceIDs->at(gpuIdx);
}
intv& ConvNet::getDeviceIDs() {
return *_deviceIDs;
}
ThreadSynchronizer& ConvNet::getSync() {
return *_sync;
}
PipeDispenser& ConvNet::getPipeDispenser() {
return *_pd;
}
void ConvNet::syncWithChildren() {
sendMessage(SYNC, false);
_sync->sync();
}
int ConvNet::getWeightUpdateFreq() {
return _weightUpdateFreq;
}
int ConvNet::getNumBwdMiniPasses() {
return _numBwdMiniPasses;
}
int ConvNet::getMinibatchSize() {
return _dp->getMinibatchSize();
}
void ConvNet::setTrainingProgress(double progress) {
_trainingProgress = progress;
}
double ConvNet::getTrainingProgress() const {
return _trainingProgress;
}
/*
* Gradient checking stuff
*/
void ConvNet::checkGradients() {
_numFailures = 0;
_numTests = 0;
fprop(0, PASS_GC);
_baseErr = getCostValue();
bprop(PASS_GC);
for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
if (it->second->getDeviceID() >= 0) {
NVMatrix::setDeviceID(it->second->getDeviceID());
it->second->checkGradients();
}
}
NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
cout << "------------------------" << endl;
if (_numFailures > 0) {
cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
} else {
cout << "ALL " << _numTests << " TESTS PASSED" << endl;
}
}
/*
* name: weight matrix name
* eps: finite difference step
*
* Perturbs each weight in turn by eps, re-runs fprop, and forms the numerical gradient
* (E(w_ij + eps) - E(w)) / (numCases * eps). The check fails if the relative error
* ||numGrad - analyticGrad|| / ||analyticGrad|| is at least GC_REL_ERR_THRESH.
*/
bool ConvNet::checkGradient(const string& name, float eps, Weights& weights) {
Matrix numGrad(weights.getNumRows(), weights.getNumCols());
Matrix diff(numGrad);
numGrad.apply(Matrix::ZERO);
Matrix weightsCPU;
weights.getW().copyToHost(weightsCPU, true);
for(int i = 0; i < weights.getNumRows(); i++) {
for (int j = 0; j < weights.getNumCols(); j++) {
float v = weightsCPU(i,j);
weightsCPU(i,j) += eps;
weights.getW().copyFromHost(weightsCPU);
weightsCPU(i,j) = v;
fprop(PASS_GC);
double err = getCostValue();
numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
if (isnan(numGrad(i,j)) || isinf(numGrad(i,j))) {
cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
cout << "Exiting." << endl;
exit(1);
}
weights.getW().copyFromHost(weightsCPU);
}
}
Matrix gradCPU;
weights.getGrad().copyToHost(gradCPU, true);
gradCPU.scale(-1.0 / _data->getNumCases());
float analNorm = gradCPU.norm();
float numNorm = numGrad.norm();
numGrad.subtract(gradCPU, diff);
float relErr = diff.norm() / analNorm;
bool fail = relErr >= GC_REL_ERR_THRESH;
if (fail || !GC_SUPPRESS_PASSES) {
cout << "========================" << endl;
printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
cout << "========================" << endl;
cout << "Analytic:" << endl;
gradCPU.print(0, 6, 0, 4);
cout << "Numeric:" << endl;
numGrad.print(0, 6, 0, 4);
printf("Analytic norm: %e\n", analNorm);
printf("Numeric norm: %e\n", numNorm);
printf("Relative error: %e\n", relErr);
}
_numTests++;
_numFailures += fail;
return fail;
}
/*
* =======================
* ConvNetGPU
* =======================
*/
ConvNetGPU::ConvNetGPU(PyObject* layerList, int deviceID, intv& deviceCPUs, ConvNet* convNet)
: Thread(false, deviceCPUs), _deviceID(deviceID), _convNet(convNet) {
try {
int numLayers = PyList_GET_SIZE(layerList);
for (int i = 0; i < numLayers; i++) {
PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
int layerDeviceID = convNet->getDeviceID(pyDictGetInt(paramsDict, "gpu"));
if (layerDeviceID == _deviceID) {
initLayer(paramsDict);
}
}
} catch (string& s) {
cout << "Error creating ConvNet: " << s << endl;
exit(1);
}
}
void ConvNetGPU::initLayer(PyObject* paramsDict) {
string type = pyDictGetString(paramsDict, "type");
string name = pyDictGetString(paramsDict, "name");
if (type == "fc") {
_layerMap[name] = new FCLayer(this, paramsDict, false, true);
} else if (type == "treefc") {
_layerMap[name] = new TreeFCLayer(this, paramsDict);
} else if (type == "conv") {
_layerMap[name] = new ConvLayer(this, paramsDict);
} else if (type == "local") {
_layerMap[name] = new LocalUnsharedLayer(this, paramsDict);
} else if (type == "pool") {
_layerMap[name] = &PoolLayer::makePoolLayer(this, paramsDict);
} else if (type == "rnorm") {
_layerMap[name] = new ResponseNormLayer(this, paramsDict);
} else if (type == "cmrnorm") {
_layerMap[name] = new CrossMapResponseNormLayer(this, paramsDict);
} else if (type == "cnorm") {
_layerMap[name] = new ContrastNormLayer(this, paramsDict);
} else if (type == "softmax") {
_layerMap[name] = new SoftmaxLayer(this, paramsDict);
} else if (type == "eltsum") {
_layerMap[name] = new EltwiseSumLayer(this, paramsDict);
} else if (type == "eltmax") {
_layerMap[name] = new EltwiseMaxLayer(this, paramsDict);
} else if (type == "neuron") {
_layerMap[name] = new NeuronLayer(this, paramsDict);
} else if (type == "nailbed") {
_layerMap[name] = new NailbedLayer(this, paramsDict);
} else if (type == "blur") {
_layerMap[name] = new GaussianBlurLayer(this, paramsDict);
} else if (type == "href") {
_layerMap[name] = new HorizontalReflectionLayer(this, paramsDict);
} else if (type == "resize") {
_layerMap[name] = new ResizeLayer(this, paramsDict);
} else if (type == "rgb2yuv") {
_layerMap[name] = new RGBToYUVLayer(this, paramsDict);
} else if (type == "rgb2lab") {
_layerMap[name] = new RGBToLABLayer(this, paramsDict);
} else if (type == "rscale") {
_layerMap[name] = new RandomScaleLayer(this, paramsDict);
} else if (type == "concat") {
_layerMap[name] = new ConcatenationLayer(this, paramsDict);
} else if (type == "hs") {
_layerMap[name] = new HiddenSexLayer(this, paramsDict);
} else if (strncmp(type.c_str(), "cost.", 5) == 0) {
CostLayer *c = &CostLayer::makeCostLayer(this, type, paramsDict);
_layerMap[name] = c;
_costs.push_back(c);
} else {
throw string("Unknown layer type ") + type;
}
}
/*
* This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
*/
void ConvNetGPU::initCuda() {
NVMatrix::setDeviceID(_deviceID);
checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
int d = _convNet->getDeviceID(i);
if (d != _deviceID) {
if (NVMatrix::canAccessDevice(_deviceID, d)) {
printf("Enabling peer access %d --> %d\n", NVMatrix::getDeviceID(), d);
checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
} else {
printf("No peer access %d --> %d\n", _deviceID, d);
}
}
}
NVMatrix::initCublas();
NVMatrix::initRandom();
srand(time(0));
}
void* ConvNetGPU::run() {
initCuda();
while (true) {
Message* m = _msgQueue.dequeue();
if (m->getMessageType() == FPROP_READY) {
FpropMessage* msg = static_cast<FpropMessage*>(m);
_layerMap[msg->getToLayer()]->fprop(msg->getPassType());
} else if (m->getMessageType() == BPROP_READY) {
BpropMessage* msg = static_cast<BpropMessage*>(m);
_layerMap[msg->getToLayer()]->incRcvdBInputMsgs();
_layerMap[msg->getToLayer()]->bprop(msg->getPassType());
} else if (m->getMessageType() == BPROP_START) {
BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
for (int i = 0; i < _costs.size(); i++) {
dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType());
}
} else if (m->getMessageType() == SYNC) {
_convNet->getSync().sync();
} else if (m->getMessageType() == COPY_TO_CPU) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->copyToCPU();
}
} else if (m->getMessageType() == COPY_TO_GPU) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->copyToGPU();
}
} else if (m->getMessageType() == RESET) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->reset();
}
} else if (m->getMessageType() == UPDATE_WEIGHTS) {
for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
it->second->updateWeights();
}
} else if (m->getMessageType() == RUNME) {
RunMeMessage* msg = static_cast<RunMeMessage*>(m);
msg->run();
}
delete m;
}
return NULL;
}
Cost& ConvNetGPU::getCost(int numCases) {
return *new Cost(numCases, _costs);
}
Layer& ConvNetGPU::operator[](string& name) {
return *_layerMap[name];
}
Layer& ConvNetGPU::getLayer(string& name) {
return *_layerMap[name];
}
int ConvNetGPU::getDeviceID() {
return _deviceID;
}
Queue<Message*>& ConvNetGPU::getMessageQueue() {
return _msgQueue;
}
void ConvNetGPU::enqueueMessage(Message* msg) {
getMessageQueue().enqueue(msg);
}
vector<CostLayer*>& ConvNetGPU::getCostLayers() {
return _costs;
}
map<string, Layer*>& ConvNetGPU::getLayerMap() {
return _layerMap;
}
ConvNet& ConvNetGPU::getConvNet() {
return *_convNet;
}
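/*
* Usage sketch (hypothetical and simplified): in this commit the training loop is actually driven
* by Worker objects dequeued in ConvNet::run, and PASS_TRAIN is assumed here to be one of the
* PASS_TYPE values alongside the PASS_GC used above. Roughly, a training worker does:
*
*   convNet->getDataProvider().setData(data);           // data: CPUData holding the batch
*   for (int i = 0; i < convNet->getDataProvider().getNumMinibatches(); ++i) {
*       convNet->fprop(i, PASS_TRAIN);                   // forward pass on minibatch i
*       convNet->bprop(PASS_TRAIN);                      // backward pass
*       if (convNet->getNumBwdMiniPasses() == 0) {       // a full weight-update period has elapsed
*           convNet->updateWeights();
*       }
*   }
*   Cost& cost = convNet->getCost();                     // per-thread costs merged for the last minibatch
*/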

126
src/cost.cu Normal file

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <iostream>
#include <cost.cuh>
using namespace std;
/*
* =====================
* Cost
* =====================
*/
Cost::Cost(int numCases) : _numCases(numCases) {
}
Cost::Cost(int numCases, vector<CostLayer*>& costs) : _numCases(numCases) {
for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
_costMap[(*it)->getName()] = &(*it)->getCost();
_costCoeffMap[(*it)->getName()] = (*it)->getCoeff();
}
}
int Cost::getNumCases() {
return _numCases;
}
doublev& Cost::operator [](const string s) {
return *_costMap[s];
}
CostMap& Cost::getCostMap() {
return _costMap;
}
CostCoeffMap& Cost::getCostCoeffMap() {
return _costCoeffMap;
}
double Cost::getValue() {
double val = 0;
for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
val += _costCoeffMap[it->first] * it->second->at(0);
}
return val;
}
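// Accumulates another Cost into this one: matching cost vectors are summed elementwise and the
// case counts are added, e.g. to aggregate results across minibatches.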
Cost& Cost::operator += (Cost& er) {
CostMap& otherMap = er.getCostMap();
CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
if (_costMap.count(it->first) == 0) {
_costMap[it->first] = new doublev();
_costCoeffMap[it->first] = otherCoeffMap[it->first];
}
vector<double>& myVec = *_costMap[it->first];
vector<double>& otherVec = *otherMap[it->first];
assert(myVec.size() == 0 || myVec.size() == otherVec.size());
for (int i = 0; i < otherVec.size(); i++) {
if (myVec.size() <= i) {
myVec.push_back(0);
}
myVec[i] += otherVec[i];
}
}
_numCases += er.getNumCases();
return *this;
}
// Merge costs in er into this cost. Unlike operator+=, this requires both costs to cover the
// same cases and to have disjoint cost maps; it is used to combine the partial costs returned
// by the different GPU threads (see ConvNet::getCost).
Cost& Cost::operator |= (Cost& er) {
assert(er.getNumCases() == getNumCases());
CostMap& otherMap = er.getCostMap();
CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
assert(_costMap.count(it->first) == 0);
_costMap[it->first] = new doublev();
_costCoeffMap[it->first] = otherCoeffMap[it->first];
vector<double>& myVec = *_costMap[it->first];
vector<double>& otherVec = *otherMap[it->first];
myVec.insert(myVec.begin(), otherVec.begin(), otherVec.end());
}
return *this;
}
Cost& Cost::operator /= (const double v) {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
for (doublev::iterator it2 = it->second->begin(); it2 != it->second->end(); ++it2) {
*it2 /= v;
}
}
return *this;
}
Cost::~Cost() {
for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
delete it->second;
}
}

65
src/cpuCNN.cu Normal file

@@ -0,0 +1,65 @@
#include "softmaxtree.cuh"
/*
* weights: (numNodes, numFeatures)
* targets: (numNodes, numFeatures)
*
* Forward pass over the tree, top-down by depth: each node's output row is its own weight row
* plus its parent's already-computed output row, i.e. the sum of the weight rows along the path
* from the root to that node.
*/
void cpuSoftmaxTreeFwd(float* weights, float* targets, const int numFeatures, SoftmaxTree& tree) {
for (int d = 0; d <= tree.getDepth(); ++d) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it!= tree.getNodesAtDepth(d).end(); ++it) {
SoftmaxNode& node = **it;
SoftmaxNode* parent = node.getParent();
for (int f = 0; f < numFeatures; ++f) {
targets[node.getLabel() * numFeatures + f] = weights[node.getLabel() * numFeatures + f]
+ (parent == NULL ? 0 : targets[parent->getLabel() * numFeatures + f]);
}
}
}
}
/*
* grads: (numNodes, numFeatures)
*
* Backward pass over the tree, bottom-up by height: each internal node's gradient row is
* overwritten with the sum of its children's gradient rows.
*/
void cpuSoftmaxTreeBwd(float* grads, const int numFeatures, SoftmaxTree& tree) {
for (int h = 1; h <= tree.getHeight(); ++h) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtHeight(h).begin(); it!= tree.getNodesAtHeight(h).end(); ++it) {
SoftmaxNode& node = **it;
for (int f = 0; f < numFeatures; ++f) {
grads[node.getLabel() * numFeatures + f] = 0;
}
for (SoftmaxNodeV::iterator itc = node.getChildren().begin(); itc!= node.getChildren().end(); ++itc) {
SoftmaxNode& child = **itc;
for (int f = 0; f < numFeatures; ++f) {
grads[node.getLabel() * numFeatures + f] += grads[child.getLabel() * numFeatures + f];
}
}
}
}
}
/*
* weights: (numNodes, numFeatures)
* weightsInc: (numNodes, numFeatures)
* weightsGrad: (numNodes, numFeatures)
* nodeSizes: numNodes-array whose ith element gives number of leaves under
* node with label i.
*/
void cpuSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
const int numFeatures, float eps, const float mom, float wc, SoftmaxTree& tree) {
for (int d = 0; d <= tree.getDepth(); d++) {
for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it!= tree.getNodesAtDepth(d).end(); ++it) {
SoftmaxNode& node = **it;
float w = wc / node.getSize();
float e = eps;// * sqrt(node.getSize());
for (int f = 0; f < numFeatures; ++f) {
weightsInc[node.getLabel() * numFeatures + f] = mom * weightsInc[node.getLabel() * numFeatures + f]
+ e * (weightsGrad[node.getLabel() * numFeatures + f] - w * weights[node.getLabel() * numFeatures + f]);
weights[node.getLabel() * numFeatures + f] += weightsInc[node.getLabel() * numFeatures + f];
}
}
}
}

98
src/data.cu Normal file

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <data.cuh>
#include <vector>
using namespace std;
DataProvider::DataProvider(int minibatchSize) :
_minibatchSize(minibatchSize), _hData(NULL) {
}
void DataProvider::clearData() {
delete _hData;
_hData = NULL;
}
void DataProvider::setData(CPUData& hData) {
// This is now deleted by the DataWorker's destructor
// delete _hData; // Delete old CPU matrices
_hData = &hData;
}
CPUData& DataProvider::getMinibatch(int idx) {
assert(idx >= 0 && idx < getNumMinibatches());
return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize);
}
CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
endCase = min(_hData->getNumCases(), endCase);
// TODO: maintain these matrices, no point re-creating them all the time
MatrixV& miniData = *new MatrixV();
for (int i = 0; i < _hData->getData().size(); i++) {
// NOTE: if hData is transposed, then the output minibatch matrix
// can be a view. No need to allocate new CPU memory here. Might
// want to look into optimizing that in the future, though it's
// unlikely to be a big deal.
if (_hData->isTrans()) {
miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
} else {
miniData.push_back(new Matrix());
(*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
}
}
return *new CPUData(&miniData);
}
int DataProvider::getNumMinibatches() {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
return DIVUP(_hData->getNumCases(), _minibatchSize);
}
int DataProvider::getMinibatchSize() {
return _minibatchSize;
}
int DataProvider::getNumCases() {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
return _hData->getNumCases();
}
int DataProvider::getNumCasesInMinibatch(int idx) {
assert(_hData != NULL);
assert(_hData->getNumCases() > 0);
assert(idx >= 0 && idx < getNumMinibatches());
return min(_minibatchSize, max(0, _hData->getNumCases() - idx * _minibatchSize));
}

34
src/hostmem.cu Normal file

@@ -0,0 +1,34 @@
#include <hostmem.cuh>
PinnedHostMem::PinnedHostMem() : _numBytes(0), _data(NULL) {
}
PinnedHostMem::~PinnedHostMem() {
if (_numBytes > 0) {
checkCudaErrors(cudaFreeHost(_data));
}
}
void PinnedHostMem::resize(uint bytes) {
if (_numBytes != bytes) {
if (_numBytes > 0) {
checkCudaErrors(cudaFreeHost(_data));
}
checkCudaErrors(cudaHostAlloc(&_data, bytes, cudaHostAllocPortable));
_numBytes = bytes;
}
}
void PinnedHostMem::copyFrom(void* src, uint bytes) {
resize(bytes);
checkCudaErrors(cudaMemcpy(_data, src, bytes, cudaMemcpyDefault));
}
void PinnedHostMem::copyTo(void* dst) {
checkCudaErrors(cudaMemcpy(dst, _data, _numBytes, cudaMemcpyDefault));
}
void* PinnedHostMem::getData() {
return _data;
}

2002
src/layer.cu Normal file

File diff suppressed because it is too large

720
src/layer_kernels.cu Normal file

@@ -0,0 +1,720 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <assert.h>
#include <vector>
#include <layer_kernels.cuh>
//#define LOG(x) ((x) > 0.0 ? log(x) : -1000.0)
// Computes log(exp(x) + exp(y))
//#define LOGADD(x, y) ()
using namespace std;
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* energies: (numOut, numCases)
* labels: (1, numCases)
* maxEnergies: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases)
*
* This routine uses energies to determine top-1 score because they're more accurate than top-n
* probabilities, which have numerical errors in them.
*/
__global__ void kMultiSoftmaxCost_engs(float* probs, float* energies, float* labels, float* maxEnergies,
float* labelLogProbs, float* correctProbs, float* top5Probs,
const int numCases, const int numOut, const int setSize) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxe = maxEnergies[tx];
const float labelp = probs[label * numCases + tx];
const float labele = energies[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
int numBiggerEnergies = 0, numEqualsEnergies = 0;
for (int i = 0; i < numOut; ++i) {
numBiggerEnergies += energies[i * numCases + tx] > labele;
numEqualsEnergies += energies[i * numCases + tx] == labele;
}
const int slotsLeft = setSize - numBiggerEnergies;
top5Probs[tx] = slotsLeft <= 0 ? 0 : (numEqualsEnergies <= slotsLeft ? 1 : float(slotsLeft) / numEqualsEnergies);
// if (numEqualsEnergies != 1) {
// printf("numEqualsEnergies: %d, labelp: %e, maxp: %e\n", numEqualsEnergies, labelp, maxe);
// }
correctProbs[tx] = labele != maxe ? 0.0f : 1.0f / float(numEqualsEnergies);
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxEnergies: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases)
*
*/
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
float* labelLogProbs, float* correctProbs, float* top5Probs,
const int numCases, const int numOut, const int setSize) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
int numBiggerProbs = 0, numEqualsProbs = 0;
for (int i = 0; i < numOut; ++i) {
numBiggerProbs += probs[i * numCases + tx] > labelp;
numEqualsProbs += probs[i * numCases + tx] == labelp;
}
const int slotsLeft = setSize - numBiggerProbs;
top5Probs[tx] = slotsLeft <= 0.0f ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
* top5Probs: (1, numCases) (*out)
*
* target: (1, numCases) == log(y_l[labels,:])
*/
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& energies, NVMatrix& labelLogProbs_out,
NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize, bool useEnergies) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
assert(energies.isContiguous());
assert(energies.isSameDims(probs));
// NVMatrix& maxProbs = probs.max(0);
NVMatrix& maxPE = useEnergies ? energies.max(0) : probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
top5Probs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
if (useEnergies) {
cudaFuncSetCacheConfig(kMultiSoftmaxCost_engs, cudaFuncCachePreferL1);
kMultiSoftmaxCost_engs<<<blocks, threads>>>(probs.getDevData(), energies.getDevData(), labels.getDevData(), maxPE.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
numCases, numOut, setSize);
} else {
cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
kMultiSoftmaxCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxPE.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
numCases, numOut, setSize);
}
getLastCudaError("computeLogregCost: Kernel execution failed");
// cudaThreadSynchronize();
delete &maxPE;
}
/*
* energies: (numCases, numOut) (yes this is weird)
* bLattice: (numOut + 1, setSize, numCases) (*out)
*
* This is intended to work for cases when setSize <= 32.
* Block size (y, x) = (1, B_X)
*
* NOTE:
* B_X must be a multiple of 32
*/
template <int B_X>
__global__ void kMSMBackward(float* energies, float* bLattice, const int numCases, const int numOut, const int setSize) {
extern __shared__ float shmem[];
const int tidx = blockIdx.x * B_X + threadIdx.x;
const int casesPerWarp = 32 / setSize;
const int casesPerBlock = casesPerWarp * B_X / 32;
const int numWorkersPerWarp = casesPerWarp * setSize;
const int tidxInWarp = tidx % 32;
const int warpIdx = tidx / 32;
const int blockCaseIdx = blockIdx.x * casesPerBlock;
const int caseIdxInBlock = threadIdx.x / setSize;
const int caseIdx = warpIdx * casesPerWarp + tidxInWarp / setSize;
const bool doWork = tidxInWarp < numWorkersPerWarp && caseIdx < numCases;
const int bIdx = threadIdx.x + threadIdx.x/setSize + 1;
volatile float* B = shmem;
volatile float* shE = &shmem[B_X + casesPerBlock]; // Dimensions (casesPerBlock, 32 + 1)
const int loadY = warpIdx;
const int loadX = tidxInWarp;
energies += (blockCaseIdx + loadY) * numOut + loadX;
bLattice += tidx;
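// NOTE: the early return below and the printf calls further down look like debugging leftovers;
// as written, only block 0 does any work.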
if (blockIdx.x != 0) {
return;
}
// The first row of the lattice has a 1 in the columns corresponding to
// zero set size, 0 elsewhere.
for (int t = threadIdx.x; t < B_X + casesPerBlock; t += B_X) {
B[t] = t % setSize == 0;
}
for (int l = 0; l < numOut / 32; ++l) { // Load 32 energies at a time for casesPerBlock cases
__syncthreads();
// Load energies into shmem
for (int r = 0; r < casesPerBlock && blockCaseIdx + loadY + r < numCases; r += B_X / 32) {
shE[(r + loadY) * (32 + 1) + loadX] = __expf(energies[r * numOut]);
printf("%f\n", energies[r * numOut]);
}
__syncthreads();
// Compute 32 rows of the lattice
if (doWork) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
B[bIdx] = B[bIdx - 1] * shE[caseIdxInBlock * (32 + 1) + i] + B[bIdx];
bLattice[i * numCases * setSize] = B[bIdx];
// printf("thread %d wrote %d to idx %d\n", tidx, B[bIdx], bIdx);
}
}
printf("thread %d made it\n", tidx);
bLattice += 32 * numCases * setSize;
}
// if (numOut % 32 != 0) {
// __syncthreads();
//
// }
}
/*
* energies: (numCases, numOut) (yes this is weird)
* bLattice: (numOut + 1, setSize, numCases) (*out)
*/
void MSMBackward(NVMatrix& energies, NVMatrix& bLattice, int setSize) {
int numCases = energies.getNumRows();
int numOut = energies.getNumCols();
assert(!energies.isTrans());
assert(!bLattice.isTrans());
assert(energies.isContiguous());
assert(bLattice.isContiguous());
bLattice.resize((numOut + 1) * setSize, numCases);
int B_X = 32;
int casesPerBlock = B_X / setSize;
int shmem = 4*(B_X + casesPerBlock + casesPerBlock * (32 + 1));
dim3 threads(B_X, 1);
dim3 blocks(DIVUP(numCases*setSize, B_X), 1);
printf("allocating %d words of shmem\n", shmem/4);
cudaFuncSetCacheConfig(kMSMBackward<32>, cudaFuncCachePreferShared);
kMSMBackward<32><<<blocks, threads, shmem>>>(energies.getDevData(), bLattice.getDevData(),
numCases, numOut, setSize);
getLastCudaError("kMSMBackward: Kernel execution failed");
}
/*
* E = sum(p_l * log(y_l))
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
probs += tx;
labels += tx;
maxProbs += tx;
labelLogProbs += tx;
correctProbs += tx;
const float maxp = maxProbs[0];
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
float crossEnt = 0.0f;
int numMax = 0;
bool correctLabel = false;
for (int i = 0; i < numOut; i++) {
const float label_prob = labels[i * numCases];
const float model_prob = probs[i * numCases];
numMax += model_prob == maxp;
crossEnt += label_prob * safelog(model_prob);
correctLabel |= model_prob == maxp && label_prob > 0.0f;
}
labelLogProbs[0] = crossEnt;
if (!correctLabel) {
correctProbs[0] = 0.0f;
} else {
correctProbs[0] = 1.0f / float(numMax);
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const float label_prob = labels[tidx];
const float model_prob = y_l[tidx];
const float v = gradCoeff * __fdividef(label_prob, model_prob);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = sum(p_l * log(y_l))
* y_l: (numOut, numCases)
* labels: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
float v = 0;
const float model_prob = y_l[tidx];
for (int j = 0; j < numOut; j++) {
const float label_prob = labels[j * numCases + tx];
v += label_prob * ((j == ty) - model_prob);
}
v *= gradCoeff;
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;
if (tx < numCases) {
const int label = int(labels[tx]);
const float maxp = maxProbs[tx];
const float labelp = probs[label * numCases + tx];
labelLogProbs[tx] = __logf(labelp);
/*
* Compute the probability of guessing the correct case if you take the most-probable label.
*
* This is done like this:
*
* - If the most probable label is not equal to the true label, then the probability is zero.
* - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
*
* This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
* maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
* Though it could never happen in reality. Well it could. But it wouldn't. Cool?
*/
if (labelp != maxp) {
correctProbs[tx] = 0;
} else {
int numMax = 0;
for (int i = 0; i < numOut; i++) {
numMax += probs[i * numCases + tx] == maxp;
}
correctProbs[tx] = 1.0f / float(numMax);
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dy_l: (numOut, numCases)
*/
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * (label == ty);
v = __fdividef(v, y_l[tidx]);
if (add) {
dE_dy_l[tidx] += v;
} else {
dE_dy_l[tidx] = v;
}
}
}
/*
* E = -log(y_t)
* y_l: (numOut, numCases)
* labels: (1, numCases)
*
* dE_dx_l: (numOut, numCases)
*/
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
const int numOut, const float gradCoeff) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
const int label = int(labels[tx]);
float v = gradCoeff * ((label == ty) - y_l[tidx]);
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
/*
* dE_dy_l: (numOut, numCases)
* y_l: (numOut, numCases)
*
* dE_dx_l: (numOut, numCases)
*
* Standard softmax backprop: dE_dx_i = y_i * sum_j dE_dy_j * ((j == i) - y_j).
*/
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut) {
const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
const int tidx = ty * numCases + tx;
if (ty < numOut && tx < numCases) {
float v = 0;
for (int j = 0; j < numOut; j++) {
v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
}
v *= y_l[tidx];
if (add) {
dE_dx_l[tidx] += v;
} else {
dE_dx_l[tidx] = v;
}
}
}
template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
const int numElements) {
for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
if (add) {
target[i] += actGrad[i] * (output[i] == input[i]);
} else {
target[i] = actGrad[i] * (output[i] == input[i]);
}
}
}
void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
assert(actGrad.isContiguous());
assert(output.isContiguous());
assert(input.isContiguous());
assert(actGrad.isSameDims(input));
assert(actGrad.isSameDims(output));
dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
dim3 threads(128);
if (add) {
assert(actGrad.isSameDims(target));
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, true><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
} else {
target.resize(actGrad);
cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
kEltwiseMaxGrad<128, false><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
}
getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
/*
* E = sum_i{-p_i*log(y_i)}
* probs: (numOut, numCases)
* labels: (numOut, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases)
*/
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.isSameDims(probs));
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
kCrossEntCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("kCrossEntCost: Kernel execution failed");
delete &maxProbs;
}
void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.isSameDims(probs));
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kCrossEntGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kCrossEntGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntGrad: Kernel execution failed");
}
void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, bool add) {
int numCases = acts.getLeadingDim();
int numOut = acts.getFollowingDim();
assert(acts.isSameDims(actsGrad));
assert(acts.isContiguous());
assert(actsGrad.isContiguous());
assert(target.isContiguous());
assert(acts.isTrans());
assert(actsGrad.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(acts);
kSoftmaxGrad<false><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
} else {
kSoftmaxGrad<true><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
}
getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}
void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
assert(!labels.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
kCrossEntSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
/*
* E = -log(y_t)
* probs: (numOut, numCases)
* labels: (1, numCases)
* maxProbs: (1, numCases)
* labelLogProbs: (1, numCases) (*out)
* correctProbs: (1, numCases) (*out)
*
* target: (1, numCases) == log(y_l[labels,:])
*/
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
int numCases = probs.getNumCols();
int numOut = probs.getNumRows();
assert(labels.getNumElements() == numCases);
assert(!labels.isTrans());
assert(!probs.isTrans());
assert(labels.isContiguous());
assert(probs.isContiguous());
NVMatrix& maxProbs = probs.max(0);
labelLogProbs_out.resize(1, numCases);
correctProbs_out.resize(1, numCases);
dim3 threads(LOGREG_ERR_THREADS_X, 1);
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
kLogregCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
numCases, numOut);
getLastCudaError("computeLogregCost: Kernel execution failed");
// cudaThreadSynchronize();
delete &maxProbs;
}
void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(!labels.isTrans());
assert(!probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kLogregCostGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregCostGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregGrad: Kernel execution failed");
}
void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
int numCases = probs.getLeadingDim();
int numOut = probs.getFollowingDim();
assert(labels.getNumElements() == numCases);
assert(probs.isContiguous());
assert(target.isContiguous());
assert(labels.isContiguous());
assert(probs.isTrans());
dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
if (!add) {
target.resize(probs);
kLogregSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
} else {
kLogregSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
numCases, numOut, coeff);
}
getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}

186
src/lr.cu Normal file

@@ -0,0 +1,186 @@
#include <string>
#include <lr.cuh>
#include <util.cuh>
using namespace std;
/*
* ==================================
* LearningRateSchedule
* ==================================
*/
LearningRateSchedule& LearningRateSchedule::make(PyObject* lrsDict, double baseRate) {
string type = pyDictGetString(lrsDict, "type");
if (type == "default") {
return *new LearningRateSchedule(baseRate, 0);
} else {
PyObject* paramsDict = PyDict_GetItemString(lrsDict, "params");
double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
double noiseStdev = pyDictGetFloat(paramsDict, "noiseStdev");
if (type == "linear") {
return *new LinearLRS(baseRate, tgtFactor, noiseStdev);
} else if (type == "exp") {
return *new ExpLRS(baseRate, tgtFactor, noiseStdev);
} else if (type == "dexp") {
double numSteps = pyDictGetInt(paramsDict, "numSteps");
return *new DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
} else if (type == "jdexp") {
double numSteps = pyDictGetInt(paramsDict, "numSteps");
return *new JumpyDiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
}
}
throw string("Unknown learning rate schedule type ") + type;
}
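/*
* The lrsDict passed to make() comes from the Python-side configuration. A sketch of its shape,
* using the field names read above (the values here are purely illustrative):
*
*   {'type': 'dexp', 'params': {'tgtFactor': 100.0, 'noiseStdev': 0.0, 'numSteps': 10}}
*/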
LearningRateSchedule::LearningRateSchedule(double baseRate, double noiseStdev)
: _baseRate(baseRate), _noiseStdev(noiseStdev), _haveRandnSpare(false), _randnSpare(0) {
}
LearningRateSchedule::LearningRateSchedule(double baseRate)
: _baseRate(baseRate), _noiseStdev(0), _haveRandnSpare(false), _randnSpare(0) {
}
double LearningRateSchedule::getRate(double progress) {
return _noiseStdev > 0 ? _getRate(progress) * (1 + abs(randn()) * _noiseStdev)
: _getRate(progress);
}
double LearningRateSchedule::_getRate(double progress) {
return _baseRate;
}
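// Draws standard-normal samples via the Box-Muller transform (note the hard-coded 3.1415
// approximation of pi), generating two per pass and caching the second in _randnSpare.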
inline double LearningRateSchedule::randn() {
if (!_haveRandnSpare) {
double T = 2 * 3.1415 * rand();
double R = std::sqrt(-2 * std::log(rand()));
_randnSpare = R * std::sin(T);
_haveRandnSpare = true;
return R * std::cos(T);
}
_haveRandnSpare = false;
return _randnSpare;
}
// This should never generate zero
inline double LearningRateSchedule::rand() const {
return double(1L + random()) / (1L + RAND_MAX);
}
inline double LearningRateSchedule::abs(double x) const {
return x > 0 ? x : -x;
}
double LearningRateSchedule::getBaseRate() const {
return _baseRate;
}
LearningRateSchedule::~LearningRateSchedule() {
}
/*
* ==================================
* LinearLRS
* ==================================
*/
LinearLRS::LinearLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev) {
_finalRate = baseRate / tgtFactor;
}
double LinearLRS::_getRate(double progress) {
return _baseRate * (1 - progress) + _finalRate * progress;
}
/*
* ==================================
* ExpLRS
* ==================================
*/
ExpLRS::ExpLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev) {
double finalRate = baseRate / tgtFactor;
_pow = baseRate == 0 ? 1 : (std::log(finalRate) / std::log(baseRate) - 1);
}
double ExpLRS::_getRate(double progress) {
return std::pow(_baseRate, 1.0 + progress * _pow);
}
/*
* ==================================
* TanhLRS
* ==================================
*/
TanhLRS::TanhLRS(double baseRate, double tgtFactor, double noiseStdev)
: LearningRateSchedule(baseRate, noiseStdev), _alpha(0), _beta(0) {
if (baseRate > 0) {
double finalRate = baseRate / tgtFactor;
_beta = 0.5 * (baseRate + finalRate);
_alpha = 2 * atanh((baseRate - finalRate) / (baseRate + finalRate));
}
}
double TanhLRS::_getRate(double progress) {
return _beta * (tanh(-_alpha * (progress - 0.5)) + 1.0);
}
/*
* ==================================
* DiscreteExpLRS
* ==================================
*/
DiscreteExpLRS::DiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
: LearningRateSchedule(baseRate, noiseStdev) {
ExpLRS elrs(baseRate, tgtFactor, 0);
double finalRate = baseRate / tgtFactor;
for (int i = 0; i < numSteps - 1; i++) {
double progress = double(i) / (numSteps - 1);
_rates.push_back(elrs._getRate(progress));
}
_rates.push_back(finalRate);
//printf("initialized base %e, final %e, stpes %d\n", baseRate, finalRate, numSteps);
}
double DiscreteExpLRS::_getRate(double progress) {
for (int i = 0; i < _rates.size(); ++i) {
if (progress <= double(i + 1) / _rates.size()) {
return _rates[i];
}
}
return _rates.back();
}
/*
* ==================================
* JumpyDiscreteExpLRS
* ==================================
*/
JumpyDiscreteExpLRS::JumpyDiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
: DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps) {
}
double JumpyDiscreteExpLRS::_getRate(double progress) {
int rateIdx = 0;
for (int i = 0; i < _rates.size(); ++i) {
if (progress <= double(i + 1) / _rates.size()) {
rateIdx = i;
break;
}
}
// The midpoint of the interval that progress falls into.
double intervalMid = double(rateIdx + 0.5) / _rates.size();
// Jumpy learning rate works like this:
// If progress is before the midpoint of the current interval (or outside the
// (0.2, 0.9) progress window), it returns the same learning rate as
// DiscreteExpLRS would.
// Otherwise it "jumps" back toward the previous interval's rate (provided there is one).
// Variant 1 (disabled): jump all the way back to the previous interval's rate.
// rateIdx -= rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;
// Variant 2 (active): use the geometric average of the learning rates of the
// current and previous intervals.
bool jump = rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;
return jump ? sqrt(_rates[rateIdx] * _rates[rateIdx - 1]) : _rates[rateIdx];
}
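// Illustration (assuming numSteps = 5, i.e. five intervals of width 0.2):
// progress = 0.25 falls in interval 1, whose midpoint is 0.3, so no jump: _rates[1].
// progress = 0.35 is past that midpoint and inside (0.2, 0.9), so the result is
// sqrt(_rates[1] * _rates[0]), the geometric mean with the previous interval's rate.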

126
src/multisoftmax.cpp Normal file
View file

@ -0,0 +1,126 @@
#include <assert.h>
//#include <mathimf.h>
#include <multisoftmax.h>
using namespace std;
// Computes log(exp(x) + exp(y))
inline double logadd(const double x, const double y) {
if (x <= -INF && y <= -INF) {
return -INF;
}
const double M = max(x,y);
const double m = min(x,y);
const double diff = M - m;
// return diff > 15 ? M : M + LOG(1.0 + EXP(-diff));
// return m <= -INF ? M : M + LOG(1.0f + EXP(-diff));
return diff > 15 ? M : (diff > 5 ? M + EXP(-diff) : M + LOG(1.0 + EXP(-diff)));
}
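// Numerical note: evaluating log(exp(x) + exp(y)) directly underflows for very
// negative inputs (x = y = -1000 would give log(0 + 0) = -inf); factoring out the
// max gives M + log(1 + exp(-diff)) = -1000 + log(2) instead. The diff > 15 branch
// drops the exp term (it is below ~3e-7), and the diff > 5 branch uses the
// approximation log(1 + z) ~= z, accurate to about z^2 / 2 there.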
/*
* elts: (numCases, numOut)
* logB: (N + 1, size + 1) -- backward lattice matrix, MUST BE initially -INF
* fixed: (numCases, 1)
* probs: (numCases, numOut) (*out)
*
* double precision is much faster than single. :/
*/
void MultiSoftmaxCPU_T_logspace(Matrix& elts, Matrix& logB, Matrix& probs, Matrix& fixed, int size, bool nofix) {
int numCases = elts.getNumRows();
assert(probs.isSameDims(elts));
assert(!elts.isTrans());
assert(!logB.isTrans());
assert(!probs.isTrans());
assert(fixed.getNumRows() == numCases);
assert(fixed.getNumCols() == 1);
int N = elts.getNumCols();
Matrix& logF = *new Matrix(size + 1, 1); // Forward column
// Prepare logB
logB(N, 0) = 0;
for (int c = 0; c < numCases; ++c) {
int fx = nofix ? -1 : int(fixed(c, 0));
// Backward pass
for (int i = N - 1; i >= 0; --i) {
double elt = elts(c, i);
logB(i, 0) = i <= fx ? -INF : 0.0f;
for (int s = max(1, size - i); s < size + 1; ++s) {
logB(i, s) = fx == i ? logB(i + 1, s - 1) + elt : logadd(logB(i + 1, s - 1) + elt, logB(i + 1, s));
}
}
// Log partition function
double logZ = logB(0, size);
// Forward pass
logF.apply(Matrix::ONE);
logF.scale(-INF);
logF(0, 0) = 0;
for (int i = 1; i < N + 1; ++i) {
double logy = -INF;
double elt = elts(c, i - 1);
for (int s = size; s >= 0; --s) {
if (s < size) {
logy = logadd(logy, logF(s, 0) + logB(i, size - 1 - s));
}
if (s > 0) {
logF(s, 0) = fx == i - 1 ? logF(s - 1, 0) + elt : logadd(logF(s - 1, 0) + elt, logF(s, 0));
} else if (fx == i - 1) {
logF(0, 0) = -INF;
}
}
logy += elt - logZ;
probs(c, i - 1) = EXP(logy) - (fx >= 0 ? probs(c, i - 1) : 0);
}
}
delete &logF;
}
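// Loosely: the backward pass fills logB(i, s) with the log-sum over all ways of
// picking s of the remaining items i..N-1, the forward column logF does the same
// for the prefix, and combining the two gives each item's (log) marginal probability
// of belonging to a subset of exactly `size` items. Both passes are O(N * size) per
// case, so the routine as a whole is O(numCases * N * size).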
MultiSoftmaxWorker::MultiSoftmaxWorker(Matrix* elts, Matrix* B, Matrix* probs, Matrix* fixed, int size, bool nofix)
: Thread(true), _elts(elts), _B(B), _probs(probs), _fixed(fixed), _size(size), _nofix(nofix) {
}
MultiSoftmaxWorker::~MultiSoftmaxWorker() {
delete _elts;
delete _probs;
delete _fixed;
}
void* MultiSoftmaxWorker::run() {
MultiSoftmaxCPU_T_logspace(*_elts, *_B, *_probs, *_fixed, _size, _nofix);
return NULL;
}
/*
* elts: (numCases, numOut)
* B: vector of (N + 1, size + 1) -- backward lattice matrices, each MUST BE initially -INF
* (see MultiSoftmaxCPU_T_logspace)
* fixed: (numCases, 1)
* probs: (numCases, numOut) (*out)
*
* NOTE: remember to write a version of this for transposed matrices.
* It may end up being significantly faster, which is important if
* I plan to use CPU for this.
*/
void MultiSoftmaxCPU_T_parallel(Matrix& elts, vector<Matrix*>& B, Matrix& probs, Matrix& fixed, int size, bool nofix) {
int numCases = elts.getNumRows();
int numWorkers = min(numCases, (int)B.size());
probs.resize(elts);
int casesPerWorker = DIVUP(numCases, B.size());
numWorkers = min(numWorkers, DIVUP(numCases, casesPerWorker));
vector<Thread*> workers;
for (int i = 0; i < numWorkers; ++i) {
Matrix* eltSlice = &elts.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
Matrix* probSlice = &probs.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
Matrix* fixedSlice = &fixed.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
workers.push_back(new MultiSoftmaxWorker(eltSlice, B[i], probSlice, fixedSlice, size, nofix));
workers[i]->start();
}
for (int i = 0; i < numWorkers; ++i) {
workers[i]->join();
delete workers[i];
}
}

85
src/neuron.cu Normal file
View file

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <neuron.cuh>
#include <util.cuh>
using namespace std;
Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
string type = pyDictGetString(neuronDict, "type");
PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");
if (type == "relu") {
return *new ReluNeuron();
}
if (type == "nrelu") {
return *new NoisyReluNeuron();
}
if (type == "drelu") {
return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "softrelu") {
return *new SoftReluNeuron();
}
if (type == "brelu") {
return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
}
if (type == "abs") {
return *new AbsNeuron();
}
if (type == "logistic") {
return *new LogisticNeuron();
}
if (type == "tanh") {
return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "square") {
return *new SquareNeuron();
}
if (type == "sqrt") {
return *new SqrtNeuron();
}
if (type == "linear") {
return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
}
if (type == "ident") {
return *new Neuron();
}
throw string("Unknown neuron type: ") + type;
}

242
src/pyconvnet.cu Normal file
View file

@ -0,0 +1,242 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <Python.h>
#include <arrayobject.h>
#include <assert.h>
#include <helper_cuda.h>
#include <cublas.h>
#include <time.h>
#include <vector>
#include <matrix.h>
#include <queue.h>
#include <worker.cuh>
#include <util.cuh>
#include <cost.cuh>
#include <pyconvnet.cuh>
#include <convnet.cuh>
using namespace std;
static ConvNet* model = NULL;
static PyMethodDef _ConvNetMethods[] = {{ "initModel", initModel, METH_VARARGS },
{ "startBatch", startBatch, METH_VARARGS },
{ "finishBatch", finishBatch, METH_VARARGS },
{ "checkGradients", checkGradients, METH_VARARGS },
{ "startMultiviewTest", startMultiviewTest, METH_VARARGS },
{ "startFeatureWriter", startFeatureWriter, METH_VARARGS },
{ "startDataGrad", startDataGrad, METH_VARARGS },
{ "syncWithHost", syncWithHost, METH_VARARGS },
{ NULL, NULL }
};
#if defined(_WIN64) || defined(_WIN32)
extern "C" __declspec(dllexport) void initpyconvnet() {
(void) Py_InitModule("pyconvnet", _ConvNetMethods);
import_array();
}
#else
void INITNAME() {
(void) Py_InitModule(QUOTEME(MODELNAME), _ConvNetMethods);
import_array();
}
#endif
PyObject* initModel(PyObject *self, PyObject *args) {
assert(model == NULL);
PyDictObject* pyLayerParams;
PyListObject* pyDeviceIDs, *pyDeviceCPUs;
int pyMinibatchSize;
int pyWeightUpdateFreq;
if (!PyArg_ParseTuple(args, "O!O!O!ii",
&PyDict_Type, &pyLayerParams,
&PyList_Type, &pyDeviceIDs,
&PyList_Type, &pyDeviceCPUs,
&pyMinibatchSize,
&pyWeightUpdateFreq)) {
return NULL;
}
intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs);
vector<intv*>& deviceCPUs = *new vector<intv*>();
for (int i = 0; i < PyList_GET_SIZE(pyDeviceCPUs); i++) {
intv* v = getIntV(PyList_GetItem((PyObject*)pyDeviceCPUs, i));
deviceCPUs.push_back(v);
}
model = new ConvNet((PyObject*)pyLayerParams,
deviceIDs,
deviceCPUs,
pyMinibatchSize,
pyWeightUpdateFreq);
model->start();
return Py_BuildValue("i", 0);
}
/*
* Starts training/testing on the given batch (asynchronous -- returns immediately).
*/
PyObject* startBatch(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
double progress;
int test = 0;
if (!PyArg_ParseTuple(args, "O!d|i",
&PyList_Type, &data,
&progress,
&test)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
/*
* Starts testing on the given batch (asynchronous -- returns immediately).
*/
PyObject* startMultiviewTest(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
int numViews;
PyArrayObject* pyProbs = NULL;
char* logregName = NULL;
if (!PyArg_ParseTuple(args, "O!i|O!s",
&PyList_Type, &data,
&numViews,
&PyArray_Type, &pyProbs,
&logregName)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
MultiviewTestWorker* wr = pyProbs == NULL ? new MultiviewTestWorker(*model, *cpuData, numViews)
: new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
PyObject* startFeatureWriter(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
PyListObject* pyFtrs;
PyListObject* pyLayerNames;
if (!PyArg_ParseTuple(args, "O!O!O!",
&PyList_Type, &data,
&PyList_Type, &pyFtrs,
&PyList_Type, &pyLayerNames)) {
return NULL;
}
stringv* layerNames = getStringV((PyObject*)pyLayerNames);
CPUData* cpuData = new CPUData((PyObject*)data);
MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs);
FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames);
model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
PyObject* startDataGrad(PyObject *self, PyObject *args) {
// assert(model != NULL);
// PyListObject* data;
// int dataLayerIdx, softmaxLayerIdx;
// if (!PyArg_ParseTuple(args, "O!ii",
// &PyList_Type, &data,
// &dataLayerIdx, &softmaxLayerIdx)) {
// return NULL;
// }
// CPUData* cpuData = new CPUData((PyObject*)data);
// Matrix& ftrs = *mvec.back();
// mvec.pop_back();
//
// DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx);
// model->getWorkerQueue().enqueue(wr);
return Py_BuildValue("i", 0);
}
/*
* Waits for the trainer to finish training on the batch given to startBatch.
*/
PyObject* finishBatch(PyObject *self, PyObject *args) {
assert(model != NULL);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::BATCH_DONE);
Cost& cost = res->getResults();
PyObject* dict = PyDict_New();
CostMap& costMap = cost.getCostMap();
for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) {
PyObject* v = PyList_New(0);
for (vector<double>::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) {
PyObject* f = PyFloat_FromDouble(*iv);
PyList_Append(v, f);
}
PyDict_SetItemString(dict, it->first.c_str(), v);
}
PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases());
delete res; // Deletes cost too
return retVal;
}
PyObject* checkGradients(PyObject *self, PyObject *args) {
assert(model != NULL);
PyListObject* data;
if (!PyArg_ParseTuple(args, "O!",
&PyList_Type, &data)) {
return NULL;
}
CPUData* cpuData = new CPUData((PyObject*)data);
GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData);
model->getWorkerQueue().enqueue(wr);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::BATCH_DONE);
delete res;
return Py_BuildValue("i", 0);
}
/*
* Copies weight matrices from GPU to system memory.
*/
PyObject* syncWithHost(PyObject *self, PyObject *args) {
assert(model != NULL);
SyncWorker* wr = new SyncWorker(*model);
model->getWorkerQueue().enqueue(wr);
WorkResult* res = model->getResultQueue().dequeue();
assert(res != NULL);
assert(res->getResultType() == WorkResult::SYNC_DONE);
delete res;
return Py_BuildValue("i", 0);
}

65
src/quantizer.cu Normal file
View file

@ -0,0 +1,65 @@
#include <quantizer.cuh>
using namespace std;
/*=================
* Quantizer
* ================
*/
Quantizer& Quantizer::make(PyObject* lrsDict) {
string type = pyDictGetString(lrsDict, "type");
if (type == "default") {
return *new Quantizer();
} else if (type == "half") {
return *new HalfQuantizer();
}
throw string("Unknown quantizer type ") + type;
}
Quantizer::Quantizer() : _numRows(0), _numCols(0), _trans(false) {
}
Quantizer::~Quantizer() {
}
void Quantizer::quantize(NVMatrix& src, NVMatrix& tgt) {
_quantize(src, tgt);
_quantized = &tgt;
_numRows = src.getNumRows();
_numCols = src.getNumCols();
_trans = src.isTrans();
}
void Quantizer::dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
_dequantize(tgt, scaleTarget, scaleOutput);
tgt.setTrans(_trans);
tgt.reshape(_numRows, _numCols);
}
void Quantizer::dequantize(NVMatrix& tgt) {
dequantize(tgt, 0, 1);
}
void Quantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
src.copy(tgt);
}
void Quantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
tgt.add(*_quantized, scaleTarget, scaleOutput);
}
/*=================
* HalfQuantizer
* ================
*/
HalfQuantizer::HalfQuantizer() : Quantizer() {
}
void HalfQuantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
convQuantizeHalf(src, tgt);
}
void HalfQuantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
convDequantizeHalf(*_quantized, tgt, _numRows * _numCols, scaleTarget, scaleOutput);
}
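// Usage sketch -- the dict contents and variable names here are illustrative only:
//   PyObject* quantDict = ...;                 // e.g. {"type": "half"}
//   Quantizer& q = Quantizer::make(quantDict);
//   NVMatrix stored, restored;
//   q.quantize(acts, stored);                  // keep a (possibly half-precision) copy of acts
//   q.dequantize(restored);                    // restored := stored (scaleTarget 0, scaleOutput 1)
//   q.dequantize(restored, 1.0f, 0.5f);        // or accumulate: restored := restored + 0.5 * stored
//   delete &q;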

441
src/softmaxtree.cu Normal file
View file

@ -0,0 +1,441 @@
#include <softmaxtree.cuh>
#include "layer.cuh"
using namespace std;
/*
* This launches a series of blocks for every node at a given depth.
* The "series" just spans the length of the weight vectors.
*
* The operation performed is (loosely):
* targets[d] := weights[d] + targets[d-1]
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (numNodesAtDepth, ceil(numFeatures/B_X))
*
* weights: (numNodes, numFeatures)
* nodes: numNodesAtDepth-length array of ushort2
* where x coordinate gives node idx and y coordinate gives parent idx
* targets: (numNodes, numFeatures)
*
*/
template<int B_X,bool root>
__global__ void kSoftmaxTreeFwd(float* weights, ushort2* nodes, float* targets, const int numFeatures) {
__shared__ ushort2 node; // node.x == node idx, node.y == parent node idx
const int depthNodeIdx = blockIdx.y;
const int featureOffset = blockIdx.x * B_X + threadIdx.x;
if (threadIdx.x == 0) {
node = nodes[depthNodeIdx];
}
__syncthreads();
weights += featureOffset;
targets += featureOffset;
// No loops for now
if (featureOffset < numFeatures) {
if (root) {
targets[node.x * numFeatures] = weights[numFeatures * node.x];
} else {
targets[node.x * numFeatures] = targets[node.y * numFeatures] + weights[numFeatures * node.x];
}
}
}
/*
* This launches a series of blocks for every node at a given height.
* The "series" just spans the length of the weight vectors.
*
* The operation performed is (loosely):
* grads[h] := sum_d{grads[h-1]}
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (numNodesAtHeight, ceil(numFeatures/B_X))
*
* grads: (numNodes, numFeatures)
* nodes: numNodesAtHeight-length array of ushort2
* where x coordinate gives node idx and y coordinate gives NUMBER OF CHILDREN
* ^ (note difference with kSoftmaxTreeFwd)
* childrenPtrs: numNodesAtHeight-length array of pointers to children indices
*
* The idea is to start one of these grids at each height, in sequence, starting
* from height = 1.
*
* The rows 0-numLabels-1 of grads must already have the correct softmax gradients (these
* are the nodes at height = 0).
*
*/
template<int B_X>
__global__ void kSoftmaxTreeBwd(float* grads, ushort2* nodes, ushort** childrenPtrs, const int numFeatures) {
__shared__ ushort2 node; // node.x == node idx, node.y == number of children
__shared__ ushort* childrenPtr;
__shared__ ushort children[B_X];
const int heightNodeIdx = blockIdx.y;
const int featureOffset = blockIdx.x * B_X + threadIdx.x;
if (threadIdx.x == 0) {
node = nodes[heightNodeIdx];
childrenPtr = childrenPtrs[heightNodeIdx];
}
__syncthreads();
grads += featureOffset;
const int nodeIdx = node.x;
const int numChildren = node.y;
float nodeGrad = 0;
for (int c = 0; c < numChildren; c += B_X) {
if (c + threadIdx.x < numChildren) {
children[threadIdx.x] = childrenPtr[c + threadIdx.x];
}
__syncthreads();
if (featureOffset < numFeatures) {
const int numChildrenLeft = min(B_X, numChildren - c);
for (int cc = 0; cc < numChildrenLeft; ++cc) {
const int childIdx = children[cc];
//const int childIdx = childrenPtr[c + cc];
nodeGrad += grads[childIdx * numFeatures];
}
}
__syncthreads();
}
if (featureOffset < numFeatures) {
grads[nodeIdx * numFeatures] = nodeGrad;
}
}
/*
*
* Block size: (y, x) = (1, B_X)
* Grid size: (y, x) = (1, numNodes)
*
* weights: (numNodes, numFeatures)
* weightsInc: (numNodes, numFeatures)
* weightsGrad: (numNodes, numFeatures)
* nodeSizes: numNodes-array whose ith element gives number of leaves under
* node with label i.
*
* TODO: why did I make nodeSizes ushort? int would prolly be fine.
*/
template<int B_X>
__global__ void kSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
ushort* nodeSizes, const int numFeatures,
float eps, const float mom, float wc) {
__shared__ int nodeSize; // number of leaves under this node
const int nodeIdx = blockIdx.x;
if (threadIdx.x == 0) {
nodeSize = nodeSizes[nodeIdx];
}
__syncthreads();
weights += nodeIdx * numFeatures;
weightsInc += nodeIdx * numFeatures;
weightsGrad += nodeIdx * numFeatures;
// TODO: make these shared?
// eps *= sqrtf(nodeSize);
wc /= nodeSize;
eps /= nodeSize; // larger epsw at the leaves
for (int f = threadIdx.x; f < numFeatures; f += B_X) {
const float inc = mom * weightsInc[f] + eps * (weightsGrad[f] - wc * weights[f]);
weightsInc[f] = inc;
weights[f] += inc;
}
}
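/*
* In effect each node gets the usual momentum update, but with the learning rate
* and weight decay divided by the node's leaf count:
*   inc := mom * inc + (eps / nodeSize) * (grad - (wc / nodeSize) * weight)
*   weight := weight + inc
* Interior nodes cover many leaves and receive summed gradients (see kSoftmaxTreeBwd),
* so they take proportionally smaller steps than the leaves.
*/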
/*
* ==================
* SoftmaxNode
* ==================
*/
int SoftmaxNode::setDistances(std::map<int, SoftmaxNodeV*>& nodeHeights,
std::map<int, SoftmaxNodeV*>& nodeDepths) {
_height = 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
_height = max(_height, (*it)->setDistances(nodeHeights, nodeDepths));
}
_height += _children.size() > 0;
if (nodeHeights.count(_height) == 0) {
nodeHeights[_height] = new SoftmaxNodeV();
}
if (nodeDepths.count(_depth) == 0) {
nodeDepths[_depth] = new SoftmaxNodeV();
}
nodeHeights[_height]->push_back(this);
nodeDepths[_depth]->push_back(this);
return _height;
}
void SoftmaxNode::setNodeCounts(int &nodes, int& leaves) {
nodes++;
leaves += _children.size() == 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
(*it)->setNodeCounts(nodes, leaves);
}
}
int SoftmaxNode::setSizes(ushort* nodeSizes) {
_size = _children.size() == 0;
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
_size += (*it)->setSizes(nodeSizes);
}
nodeSizes[_label] = _size;
return _size;
}
SoftmaxNode::SoftmaxNode(SoftmaxNode* parent, int label)
: _parent(parent), _label(label), _size(0), _height(0) {
_depth = parent == NULL ? 0 : parent->getDepth() + 1;
}
SoftmaxNode::~SoftmaxNode() {
for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
delete *it;
}
}
int SoftmaxNode::getDepth() const {
return _depth;
}
int SoftmaxNode::getHeight() const {
return _height;
}
int SoftmaxNode::getSize() const {
return _size;
}
int SoftmaxNode::getLabel() const {
return _label;
}
SoftmaxNode* SoftmaxNode::getParent() {
return _parent;
}
SoftmaxNodeV& SoftmaxNode::getChildren() {
return _children;
}
SoftmaxNode& SoftmaxNode::addChild(int label) {
_children.push_back(new SoftmaxNode(this, label));
return *_children.back();
}
/*
* ==================
* SoftmaxTree
* ==================
*/
SoftmaxTree::SoftmaxTree(int rootLabel) {
_root = new SoftmaxNode(NULL, rootLabel);
_nodeSizes = NULL;
_numNodes = 0;
_numLeaves = 0;
}
SoftmaxTree::~SoftmaxTree() {
checkCudaErrors(cudaFreeHost(_nodeSizes));
for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
int height = it->first;
SoftmaxNodeV& nodes = *it->second;
for (int n = 0; n < nodes.size(); n++) {
checkCudaErrors(cudaFreeHost(_nodeChildMeta[height][n]));
}
checkCudaErrors(cudaFreeHost(_nodeChildMeta[height]));
checkCudaErrors(cudaFreeHost(_nodeBwdMeta[height]));
delete &nodes;
}
for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
int depth = it->first;
checkCudaErrors(cudaFreeHost(_nodeFwdMeta[depth]));
delete &nodes;
}
delete _root;
}
void SoftmaxTree::setFwdMeta() {
for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
ushort2* meta;
checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
int depth = it->first;
_nodeFwdMeta[depth] = meta;
for (int n = 0; n < nodes.size(); n++) {
meta[n].x = nodes[n]->getLabel();
// Setting the root to have parent 0 is ok because the fwd kernel won't
// query this anyway when root == true.
meta[n].y = nodes[n]->getParent() == NULL ? 0 : nodes[n]->getParent()->getLabel();
}
}
}
void SoftmaxTree::setBwdMeta() {
for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
SoftmaxNodeV& nodes = *it->second;
ushort2* meta;
ushort** childMeta;
checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
checkCudaErrors(cudaHostAlloc(&childMeta, sizeof(ushort*) * nodes.size(), cudaHostAllocPortable));
int height = it->first;
_nodeBwdMeta[height] = meta;
_nodeChildMeta[height] = childMeta;
for (int n = 0; n < nodes.size(); n++) {
checkCudaErrors(cudaHostAlloc(&childMeta[n], sizeof(ushort) * nodes[n]->getChildren().size(), cudaHostAllocPortable));
for (int c = 0; c < nodes[n]->getChildren().size(); c++) {
childMeta[n][c] = nodes[n]->getChildren()[c]->getLabel();
}
meta[n].x = nodes[n]->getLabel();
meta[n].y = nodes[n]->getChildren().size();
}
}
}
void SoftmaxTree::setDistances() {
_nodeHeights.clear();
_nodeDepths.clear();
_root->setDistances(_nodeHeights, _nodeDepths);
}
void SoftmaxTree::setNodeCounts() {
_numNodes = 0;
_numLeaves = 0;
_root->setNodeCounts(_numNodes, _numLeaves);
}
void SoftmaxTree::setNodeSizes() {
assert(_numLeaves > 0);
checkCudaErrors(cudaHostAlloc(&_nodeSizes, sizeof(ushort) * _numNodes, cudaHostAllocPortable));
_root->setSizes(_nodeSizes);
}
void SoftmaxTree::finalize() {
setDistances();
setNodeCounts();
setNodeSizes();
setFwdMeta();
setBwdMeta();
}
SoftmaxNode& SoftmaxTree::getRoot() {
return *_root;
}
SoftmaxNodeV& SoftmaxTree::getNodesAtHeight(int height) {
return *_nodeHeights[height];
}
SoftmaxNodeV& SoftmaxTree::getNodesAtDepth(int depth) {
return *_nodeDepths[depth];
}
int SoftmaxTree::getHeight() const {
return _root->getHeight();
}
/*
* A tree with only a root is taken to have depth 0.
*/
int SoftmaxTree::getDepth() const {
return _nodeDepths.size() - 1;
}
int SoftmaxTree::getNumLeaves() const {
return _numLeaves;
}
int SoftmaxTree::getNumNodes() const {
return _numNodes;
}
/*
* offsets: (numNodes, numFeatures)
* targets: (numNodes, numFeatures)
*/
void SoftmaxTree::makeWeights(NVMatrix& offsets, NVMatrix& targets) {
preprocess(offsets);
preprocess(targets);
assert(offsets.getNumRows() == _numNodes);
assert(targets.isSameDims(offsets));
int numFeatures = offsets.getNumCols();
dim3 threads = dim3(256); // 256 seems to work best on dummy binary tree
dim3 blocks = dim3(DIVUP(numFeatures, 256), 1); // Only the root is at depth 0
cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, true>, cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, false>, cudaFuncCachePreferL1);
kSoftmaxTreeFwd<256, true><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[0], targets.getDevData(), numFeatures);
getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
for (int d = 1; d <= getDepth(); d++) {
blocks = dim3(DIVUP(numFeatures, 256), _nodeDepths[d]->size());
kSoftmaxTreeFwd<256, false><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[d], targets.getDevData(), numFeatures);
getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
}
postprocess(offsets);
postprocess(targets);
}
/*
* grads: (numNodes, numFeatures)
*
* The idea is that grads contains gradients for the leaves
* (i.e. the first numLabels rows), so this routine will
* distribute them up the tree.
*
*/
void SoftmaxTree::distributeGradients(NVMatrix& grads) {
preprocess(grads);
assert(grads.getNumRows() == _numNodes);
int numFeatures = grads.getNumCols();
// The leaves (nodes at height = 0) already have gradients computed.
// So start at the nodes at height = 1.
dim3 threads = dim3(512); // this block size works best :/
cudaFuncSetCacheConfig(kSoftmaxTreeBwd<512>, cudaFuncCachePreferL1);
for (int h = 1; h <= getHeight(); ++h) {
dim3 blocks = dim3(DIVUP(numFeatures, 512), _nodeHeights[h]->size());
kSoftmaxTreeBwd<512><<<blocks, threads>>>(grads.getDevData(), _nodeBwdMeta[h], _nodeChildMeta[h], numFeatures);
getLastCudaError("kSoftmaxTreeBwd: kernel execution failed");
}
postprocess(grads);
}
/*
* inc := mom * inc - wc * epsW * weight + epsW * grad
* weight := weight + inc
*
* weights: (numNodes, numFeatures)
* incs: (numNodes, numFeatures)
* grads: (numNodes , numFeatures)
*/
void SoftmaxTree::updateWeights(NVMatrix& weights, NVMatrix& incs, NVMatrix& grads, float epsWBase, float mom, float wcBase) {
preprocess(weights);
preprocess(incs);
preprocess(grads);
assert(grads.getNumRows() == _numNodes);
assert(grads.isSameDims(incs));
assert(grads.isSameDims(weights));
int numFeatures = grads.getNumCols();
dim3 threads = dim3(512);
dim3 blocks = dim3(_numNodes);
cudaFuncSetCacheConfig(kSoftmaxTreeUpdateWeights<512>, cudaFuncCachePreferL1);
kSoftmaxTreeUpdateWeights<512><<<blocks, threads>>>(weights.getDevData(), incs.getDevData(), grads.getDevData(),
_nodeSizes, numFeatures, epsWBase, mom, wcBase);
getLastCudaError("kSoftmaxTreeUpdateWeights: kernel execution failed");
weights.transpose();
incs.transpose();
grads.transpose();
}
void SoftmaxTree::preprocess(NVMatrix& inp) {
inp.transpose();
assert(!inp.isTrans());
assert(inp.isContiguous());
}
void SoftmaxTree::postprocess(NVMatrix& inp) {
inp.transpose();
}

378
src/test.cu Normal file
View file

@ -0,0 +1,378 @@
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <set>
#include <test.cuh>
#include <layer_kernels.cuh>
#include <multisoftmax.h>
#include <cpuCNN.cuh>
static StopWatchInterface *timer = NULL;
using namespace std;
void init_tests(int boardNum) {
cudaSetDevice(boardNum > -1 ? boardNum : 0);
// cublasInit();
NVMatrix::initCublas();
NVMatrix::initRandom(7);
sdkCreateTimer(&timer);
}
void compareResults(Matrix& cpu, NVMatrix& gpu, const char* matrixName) {
Matrix gpuOnCPU(cpu);
gpu.copyToHost(gpuOnCPU);
gpuOnCPU.subtract(cpu);
gpuOnCPU.apply(Matrix::ABS);
printf("Max diff between CPU/GPU matrices %s: %.6f\n", matrixName, gpuOnCPU.max());
}
void test_blattice() {
printf("===============================\n");
printf("test_blattice\n");
printf("===============================\n");
int numCases = 2;
int numOut = 32;
int setSize = 3;
cout << "numCases: " << numCases << endl;
cout << "numOut: " << numOut << endl;
cout << "setSize: " << setSize << endl;
NVMatrix nvEnergies(numCases, numOut);
Matrix energies(numCases, numOut);
Matrix bLattice(numOut, numCases * setSize);
nvEnergies.randomizeUniform();
nvEnergies.copyToHost(energies);
//energies.randomizeUniform();
bLattice.apply(Matrix::ZERO); // for now
Matrix &enMax = energies.max(1);
energies.addVector(enMax, -1);
nvEnergies.copyFromHost(energies);
NVMatrix nvBLattice(bLattice, true);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
MSMBackward(nvEnergies, nvBLattice, setSize);
cudaThreadSynchronize();
sdkStopTimer(&timer);
printf("Energies: \n");
nvEnergies.print(10, 5);
printf("GPU (partial) result:\n");
nvBLattice.print(0, 5, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}
//void test_multiSoftmaxCPU() {
// printf("===============================\n");
// printf("test_multiSoftmaxCPU\n");
// printf("===============================\n");
//
// int numCases = 2;
// int numOut = 5;
// int setSize = 3;
//
//// int numCases = 128;
//// int numOut = 1000;
//// int setSize = 5;
//
// cout << "numCases: " << numCases << endl;
// cout << "numOut: " << numOut << endl;
// cout << "setSize: " << setSize << endl;
//
// Matrix energies(numCases, numOut);
// Matrix B(numOut + 1, setSize + 1);
// Matrix probs(energies);
// energies.randomizeUniform();
// probs.apply(Matrix::ZERO); // for now
//
// Matrix &enMax = energies.max(1);
// energies.addVector(enMax, -1);
// B.apply(Matrix::ZERO);
//
// sdkResetTimer(&timer);
// sdkStartTimer(&timer);
//
// MultiSoftmaxCPU_T(energies, B, probs, setSize, -1);
//
// cudaThreadSynchronize();
// sdkStopTimer(&timer);
//
// printf("Energies: \n");
// energies.print(10, 5);
//
// printf("CPU (partial) result:\n");
// probs.print(0, 5, 0, 5);
// printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
//}
void test_multiSoftmaxCPU_parallel() {
printf("===============================\n");
printf("test_multiSoftmaxCPU_parallel\n");
printf("===============================\n");
int workers = 8;
int numCases = 2;
int numOut = 5;
int setSize = 2;
// int numCases = 128;
// int numOut = 1000;
// int setSize = 5;
cout << "workers: " << workers << endl;
cout << "numCases: " << numCases << endl;
cout << "numOut: " << numOut << endl;
cout << "setSize: " << setSize << endl;
NVMatrix nvEnergies(numCases, numOut);
Matrix energies(numCases, numOut);
vector<Matrix*> B;
Matrix probs(energies);
Matrix fixed(numCases, 1);
nvEnergies.randomizeUniform();
nvEnergies.copyToHost(energies);
//energies.randomizeUniform();
probs.apply(Matrix::ZERO); // for now
Matrix &enMax = energies.max(1);
energies.addVector(enMax, -1);
fixed.apply(Matrix::ONE);
fixed.scale(2);
for (int i = 0; i < workers; i++) {
B.push_back(new Matrix(numOut + 1, setSize + 1));
B[i]->apply(Matrix::ONE);
B[i]->scale(-INF);
}
sdkResetTimer(&timer);
sdkStartTimer(&timer);
MultiSoftmaxCPU_T_parallel(energies, B, probs, fixed, setSize, true);
cudaThreadSynchronize();
sdkStopTimer(&timer);
printf("Energies: \n");
energies.print(10, 10);
printf("CPU (partial) result:\n");
probs.print(0, 5, 0, 10);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}
SoftmaxTree* makeDummyTree(int depth) {
int numNodes = (1 << (depth + 1)) - 1;
int numLeaves = (numNodes + 1) / 2;
int idx = numNodes - 1;
SoftmaxTree* tree = new SoftmaxTree(idx--);
vector<SoftmaxNode*> prevLevel;
prevLevel.push_back(&tree->getRoot());
while (idx >= 0) {
int sz = prevLevel.size();
for (int i = 0; i < sz; i++) {
SoftmaxNode& node = *prevLevel[0];
SoftmaxNode& child1 = node.addChild(idx--);
SoftmaxNode& child2 = node.addChild(idx--);
prevLevel.push_back(&child1);
prevLevel.push_back(&child2);
prevLevel.erase(prevLevel.begin());
}
}
tree->finalize();
assert(tree->getNumLeaves() == numLeaves);
assert(tree->getNumNodes() == numNodes);
return tree;
}
void test_sftree_fwd() {
printf("===============================\n");
printf("test_sftree_fwd\n");
printf("===============================\n");
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix weights(tree->getNumNodes(), numFeatures);
Matrix targets(tree->getNumNodes(), numFeatures);
NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
NVMatrix nvTargets(tree->getNumNodes(), numFeatures);
weights.randomizeUniform();
nvWeights.copyFromHost(weights);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeFwd(weights.getData(), targets.getData(), numFeatures, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
targets.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvWeights.transpose();
nvTargets.transpose();
sdkStartTimer(&timer);
tree->makeWeights(nvWeights, nvTargets);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvWeights.transpose();
nvTargets.transpose();
printf("GPU (partial) result:\n");
nvTargets.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(targets, nvTargets, "targets");
}
void test_sftree_bwd() {
printf("===============================\n");
printf("test_sftree_bwd\n");
printf("===============================\n");
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix grads(tree->getNumNodes(), numFeatures);
NVMatrix nvGrads(tree->getNumNodes(), numFeatures);
grads.randomizeUniform();
nvGrads.copyFromHost(grads);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeBwd(grads.getData(), numFeatures, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
grads.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvGrads.transpose();
sdkStartTimer(&timer);
tree->distributeGradients(nvGrads);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvGrads.transpose();
printf("GPU (partial) result:\n");
nvGrads.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(grads, nvGrads, "grads");
}
void test_sftree_update() {
printf("===============================\n");
printf("test_sftree_update\n");
printf("===============================\n");
float eps = 0.001, wc = 0.005, mom = 0.9;
int numFeatures = 6*6*128;
int depth = 10;
SoftmaxTree* tree = makeDummyTree(depth);
cout << "numFeatures: " << numFeatures << endl;
cout << "depth: " << depth << endl;
cout << "numNodes: " << tree->getNumNodes() << endl;
cout << "numLabels: " << tree->getNumLeaves() << endl;
Matrix grads(tree->getNumNodes(), numFeatures);
Matrix weights(tree->getNumNodes(), numFeatures);
Matrix incs(tree->getNumNodes(), numFeatures);
NVMatrix nvGrads(tree->getNumNodes(), numFeatures);
NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
NVMatrix nvIncs(tree->getNumNodes(), numFeatures);
grads.randomizeUniform();
weights.randomizeUniform();
incs.randomizeUniform();
nvGrads.copyFromHost(grads);
nvWeights.copyFromHost(weights);
nvIncs.copyFromHost(incs);
sdkResetTimer(&timer);
sdkStartTimer(&timer);
cpuSoftmaxTreeUpdateWeights(weights.getData(), incs.getData(), grads.getData(), numFeatures, eps, mom, wc, *tree);
sdkStopTimer(&timer);
printf("CPU (partial) result:\n");
weights.print(0, 7, 0, 5);
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
sdkResetTimer(&timer);
cudaDeviceSynchronize();
nvGrads.transpose();
nvWeights.transpose();
nvIncs.transpose();
sdkStartTimer(&timer);
tree->updateWeights(nvWeights, nvIncs, nvGrads, eps, mom, wc);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
nvGrads.transpose();
nvWeights.transpose();
nvIncs.transpose();
printf("GPU (partial) result:\n");
nvWeights.print(0, 7, 0, 5);
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
compareResults(weights, nvWeights, "weights");
compareResults(incs, nvIncs, "incs");
}
int main(int argc, char** argv) {
int boardNum = get_board_lock();
if (boardNum == GPU_LOCK_NO_BOARD) {
printf("No free GPU boards!\n");
exit(EXIT_FAILURE);
} else if(boardNum == GPU_LOCK_NO_SCRIPT) {
printf("Running on default board.\n");
} else {
printf("Running on board %d\n", boardNum);
}
init_tests(boardNum);
// test_blattice();
// test_multiSoftmaxCPU();
// test_multiSoftmaxCPU_parallel();
// test_sftree_fwd();
// test_sftree_bwd();
// test_mdiag();
// test_mdiagGrad();
return 0;
}

124
src/util.cu Normal file
View file

@ -0,0 +1,124 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <util.cuh>
using namespace std;
stringv* getStringV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
stringv* vec = new stringv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i))));
}
return vec;
}
floatv* getFloatV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
floatv* vec = new floatv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i)));
}
return vec;
}
intv* getIntV(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
intv* vec = new intv();
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)));
}
return vec;
}
int* getIntA(PyObject* pyList) {
if (pyList == NULL) {
return NULL;
}
int* arr = new int[PyList_GET_SIZE(pyList)];
for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i));
}
return arr;
}
MatrixV* getMatrixV(PyObject* pyList) {
return pyList == NULL ? NULL : getMatrixV(pyList, PyList_GET_SIZE(pyList));
}
MatrixV* getMatrixV(PyObject* pyList, int len) {
if (pyList == NULL) {
return NULL;
}
MatrixV* vec = new MatrixV();
for (int i = 0; i < len; i++) {
vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i)));
}
return vec;
}
int pyDictGetInt(PyObject* dict, const char* key) {
return PyInt_AS_LONG(PyDict_GetItemString(dict, key));
}
intv* pyDictGetIntV(PyObject* dict, const char* key) {
return getIntV(PyDict_GetItemString(dict, key));
}
int* pyDictGetIntA(PyObject* dict, const char* key) {
return getIntA(PyDict_GetItemString(dict, key));
}
string pyDictGetString(PyObject* dict, const char* key) {
return string(PyString_AS_STRING(PyDict_GetItemString(dict, key)));
}
float pyDictGetFloat(PyObject* dict, const char* key) {
return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key));
}
floatv* pyDictGetFloatV(PyObject* dict, const char* key) {
return getFloatV(PyDict_GetItemString(dict, key));
}
Matrix* pyDictGetMatrix(PyObject* dict, const char* key) {
return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key));
}
MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) {
return getMatrixV(PyDict_GetItemString(dict, key));
}
stringv* pyDictGetStringV(PyObject* dict, const char* key) {
return getStringV(PyDict_GetItemString(dict, key));
}

378
src/weights.cu Normal file
View file

@ -0,0 +1,378 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <weights.cuh>
#include <softmaxtree.cuh>
#include <lr.cuh>
#include "worker.cuh"
NVMatrix& Weights::operator*() const {
return getW();
}
Weights::Weights(Weights& srcWeights, LearningRateSchedule& lrs)
: _srcWeights(&srcWeights), _lrs(&lrs), _wc(0), _wball(0), _onGPU(false), _numUpdates(0),
_weights(NULL), _weightsInc(NULL), _weightsGrad(NULL), _cleanup(false) {
_hWeights = &srcWeights.getCPUW();
_hWeightsInc = &srcWeights.getCPUWInc();
_mom = srcWeights.getMom();
_useGrad = srcWeights.isUseGrad();
_superEps = srcWeights.getSuperEps();
}
Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wc,
float wball, float mom, float superEps, bool useGrad, bool cleanup)
: _srcWeights(NULL), _hWeights(&hWeights), _hWeightsInc(&hWeightsInc), _numUpdates(0),
_lrs(&lrs), _wc(wc), _wball(wball), _mom(mom), _useGrad(useGrad), _superEps(superEps),
_onGPU(false), _weights(NULL),_weightsInc(NULL), _weightsGrad(NULL), _cleanup(cleanup) {
assert(_superEps <= 0 || _useGrad); // superWeights ==> useGrad
}
Weights::~Weights() {
delete _lrs;
if (_cleanup) {
delete _hWeights;
delete _hWeightsInc;
if (_srcWeights == NULL) {
delete _weights;
delete _weightsInc;
delete _weightsGrad;
}
}
}
NVMatrix& Weights::getW() const {
assert(_onGPU);
return *_weights;
}
NVMatrix& Weights::getInc() const {
assert(_onGPU);
return *_weightsInc;
}
NVMatrix& Weights::getGrad() const {
assert(_onGPU);
return _useGrad ? *_weightsGrad : *_weightsInc;
}
Matrix& Weights::getCPUW() const {
return *_hWeights;
}
Matrix& Weights::getCPUWInc() const {
return *_hWeightsInc;
}
int Weights::getNumRows() const {
return _hWeights->getNumRows();
}
int Weights::getNumCols() const {
return _hWeights->getNumCols();
}
void Weights::copyToCPU() {
if (_srcWeights == NULL) {
assert(_onGPU);
_weights->copyToHost(*_hWeights);
_weightsInc->copyToHost(*_hWeightsInc);
}
}
// This function is assumed to be called in the order in which the layers
// were defined
void Weights::copyToGPU() {
assert(!_onGPU);
if (_srcWeights == NULL) {
_weights = _weights == NULL ? new NVMatrix() : _weights;
_weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc;
_weights->copyFromHost(*_hWeights, true);
_weightsInc->copyFromHost(*_hWeightsInc, true);
_weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL;
_weightsGradAvg = _superEps > 0 ? new NVMatrix() : NULL;
_weightsGrad2Avg = _superEps > 0 ? new NVMatrix() : NULL;
} else {
_weights = _srcWeights->_weights;
_weightsInc = _srcWeights->_weightsInc;
_weightsGrad = _srcWeights->_weightsGrad;
}
_onGPU = true;
}
#define SUPERMOM_THREADS 256
#define SUPERMOM_BLOCKS_MAX 4096
/*
* V = eps * g / (G2 - G^2 + superEps)^.5 + mom * V
*/
__global__ void superMomUpdate(float* V, float* g, float* G, float* G2,
const float eps, const float mom, const float superEps, const int numElements) {
const int tidx = blockIdx.x * SUPERMOM_THREADS + threadIdx.x;
for (int t = tidx; t < numElements; t += gridDim.x * SUPERMOM_THREADS) {
V[t] = mom * V[t] + eps * __fdividef(g[t], sqrtf(G2[t] - G[t] * G[t] + superEps));
}
}
// When _useGrad is false, weightsInc is assumed to contain the
// entire, properly scaled weight increment.
// OTHERWISE, scale your gradient by 1 / numCases only.
// The scaling by epsW will be done in this routine.
void Weights::update(float progress) {
// Only true owner of weights updates
if (_srcWeights == NULL && _lrs->getBaseRate() > 0) {
assert(_onGPU);
if (_superEps <= 0) {
if (_useGrad) {
_weightsInc->add(*_weightsGrad, _mom, _lrs->getRate(progress));
}
} else {
if (!_weightsGradAvg->isSameDims(*_weightsGrad)) {
_weightsGradAvg->resize(*_weightsGrad);
_weightsGrad2Avg->resize(*_weightsGrad);
_weightsGradAvg->apply(NVMatrixOps::Zero());
_weightsGrad2Avg->apply(NVMatrixOps::Zero());
}
_weightsGradAvg->add(*_weightsGrad, _mom, 1 - _mom);
_weightsGrad2Avg->applyBinary(Grad2AvgOperator(_mom), *_weightsGrad);
// Geoff version
// Make sure all matrices are contiguous
assert(_weightsGrad->isContiguous());
assert(_weightsGradAvg->isContiguous());
assert(_weightsGrad2Avg->isContiguous());
assert(_weightsInc->isContiguous());
// Make sure they all have the same transposedness
assert(_weightsGrad->isTrans() == _weightsGradAvg->isTrans());
assert(_weightsGradAvg->isTrans() == _weightsGrad2Avg->isTrans());
assert(_weightsGrad2Avg->isTrans() == _weightsInc->isTrans());
// Make sure they all have the same sizes
assert(_weightsGrad->isSameDims(*_weightsGradAvg));
assert(_weightsGradAvg->isSameDims(*_weightsGrad2Avg));
assert(_weightsGrad2Avg->isSameDims(*_weightsInc));
int numElements = _weights->getNumElements();
dim3 blocks(std::min(DIVUP(numElements, SUPERMOM_THREADS), SUPERMOM_BLOCKS_MAX));
dim3 threads(SUPERMOM_THREADS);
//float super = _superEps + 1000000*_weightsGrad2Avg->sum() / numElements;
//printf("super: %f\n", super);
superMomUpdate<<<blocks, threads>>>(_weightsInc->getDevData(), _weightsGrad->getDevData(),
_weightsGradAvg->getDevData(), _weightsGrad2Avg->getDevData(),
_lrs->getRate(progress), _mom, _superEps, numElements);
getLastCudaError("superMomUpdate: Kernel execution failed");
//_weightsInc->print(4,4);
//_weightsGrad2Avg->print(5,5);exit(0);
// Ilya version
}
if (_wc > 0) {
_weightsInc->add(*_weights, -_wc * _lrs->getRate(progress));
}
_weights->add(*_weightsInc);
_numUpdates = 0;
}
}
int Weights::incNumUpdates() {
if (_srcWeights != NULL) {
return _srcWeights->incNumUpdates();
}
return _numUpdates++;
}
// Returns the number of times a gradient has been computed for this
// weight matrix during the current pass (interval between two calls of update())
// through the net. This number will only be greater than 1 if this weight matrix
// is *shared* by multiple layers in the net.
int Weights::getNumUpdates() const {
if (_srcWeights != NULL) {
return _srcWeights->getNumUpdates();
}
return _numUpdates;
}
float Weights::getEps(float progress) const {
return _lrs->getRate(progress);
}
float Weights::getMom() const {
return _mom;
}
float Weights::getWC() const {
return _wc;
}
float Weights::getWBall() const {
return _wball;
}
bool Weights::isUseGrad() const { // is good grammar
return _useGrad;
}
bool Weights::isOwner() const {
return _srcWeights == NULL;
}
float Weights::getSuperEps() const {
return _superEps;
}
LearningRateSchedule& Weights::getLearningRateSchedule() const {
return *_lrs;
}
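/*
* Typical lifecycle (sketch): construct from host matrices, call copyToGPU() once on
* the owning GPU thread, accumulate gradients into getGrad() each minibatch (calling
* incNumUpdates() per contribution), then call update(progress) to apply momentum and
* weight decay and reset the counter; copyToCPU() pulls weights back for saving.
* Instances constructed from another Weights object share the owner's GPU matrices
* and leave the actual update to the owner.
*/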
/*
* ===============
* TreeWeights
* ===============
*/
TreeWeights::TreeWeights(SoftmaxTree& tree, Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wcBase, float mom)
: _tree(&tree), Weights(hWeights, hWeightsInc, lrs, wcBase, 0, mom, 0, true) {
assert(hWeights.isTrans());
assert(hWeightsInc.isTrans());
}
NVMatrix& TreeWeights::getW() const {
return *_leafWeights;
}
NVMatrix& TreeWeights::getInc() const {
return *_leafInc;
}
NVMatrix& TreeWeights::getGrad() const {
return *_leafGrad;
}
NVMatrix& TreeWeights::getAllW() const {
return *_weights;
}
NVMatrix& TreeWeights::getAllInc() const {
return *_weightsInc;
}
NVMatrix& TreeWeights::getAllGrad() const {
return *_weightsGrad;
}
void TreeWeights::copyToGPU() {
assert(!_onGPU);
Weights::copyToGPU();
_tree->finalize();
_effWeights.resize(*_weights);
_leafWeights = &_effWeights.sliceCols(0, _tree->getNumLeaves());
_leafGrad = &_weightsGrad->sliceCols(0, _tree->getNumLeaves());
_leafInc = &_weightsInc->sliceCols(0, _tree->getNumLeaves());
assert(_leafWeights->isView());
makeWeights();
}
int TreeWeights::getNumRows() const {
return _tree->getNumNodes();
}
void TreeWeights::update(float progress) {
// Only true owner of weights updates
if (_lrs->getBaseRate() > 0) {
assert(_onGPU);
distributeGradients();
_tree->updateWeights(*_weights, *_weightsInc, *_weightsGrad, _lrs->getRate(progress), _mom, _wc);
makeWeights();
_numUpdates = 0;
}
}
void TreeWeights::makeWeights() {
_tree->makeWeights(*_weights, _effWeights);
}
void TreeWeights::distributeGradients() {
_tree->distributeGradients(*_weightsGrad);
}
/*
* ===============
* DummyWeights
* ===============
*/
DummyWeights::DummyWeights(Matrix& hWeights, Matrix& hWeightsInc,
NVMatrix& weights, NVMatrix& incs, NVMatrix& grads)
: Weights(hWeights, hWeightsInc, *new LearningRateSchedule(0), 0, 0, 0, 0, true, false) {
_onGPU = true;
_weights = &weights;
_weightsInc = &incs;
_weightsGrad = &grads;
}
/*
* ===============
* WeightList
* ===============
*/
Weights& WeightList::operator[](const int idx) const {
return *_weightList[idx];
}
WeightList::~WeightList() {
for (int i = 0; i < _weightList.size(); i++) {
delete _weightList[i];
}
}
WeightList::WeightList() {
}
void WeightList::addWeights(Weights& w) {
_weightList.push_back(&w);
}
void WeightList::update(float progress) {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->update(progress);
}
}
void WeightList::copyToCPU() {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->copyToCPU();
}
}
void WeightList::copyToGPU() {
for (int i = 0; i < getSize(); i++) {
_weightList[i]->copyToGPU();
}
}
int WeightList::getSize() const {
return _weightList.size();
}

279
src/worker.cu Normal file
View file

@ -0,0 +1,279 @@
/*
* Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <algorithm>
#include <util.cuh>
#include <worker.cuh>
using namespace std;
/*
* ====================
* WorkResult
* ====================
*/
WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) {
}
WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) {
}
WorkResult::~WorkResult() {
delete _results; // delete NULL is ok
}
Cost& WorkResult::getResults() const {
return *_results;
}
WorkResult::RESULTS WorkResult::getResultType() const {
return _resultType;
}
/*
* ====================
* Worker
* ====================
*/
Worker::Worker(ConvNet& convNet) : _convNet(&convNet) {
}
/*
* ====================
* DataWorker
* ====================
*/
DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data) {
_dp = &convNet.getDataProvider();
_dp->setData(*_data);
}
DataWorker::~DataWorker() {
_dp->clearData();
}
/*
* ====================
* TrainingWorker
* ====================
*/
TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test)
: DataWorker(convNet, data), _progress(progress), _test(test) {
}
// Need to setData here (as opposed to the constructor) because the constructor executes in
// the original CPU thread, which is not the one with GPU access.
void TrainingWorker::run() {
_convNet->setTrainingProgress(_progress);
Cost& batchCost = *new Cost(0);
for (int i = 0; i < _dp->getNumMinibatches(); i++) {
_convNet->fprop(i, _test ? PASS_TEST : PASS_TRAIN);
_convNet->getCost(batchCost);
if (!_test) {
_convNet->bprop(PASS_TRAIN);
_convNet->updateWeights();
}
}
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* SyncWorker
* ====================
*/
SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) {
}
void SyncWorker::run() {
_convNet->copyToCPU();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE));
}
/*
* ====================
* GradCheckWorker
* ====================
*/
GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data)
: DataWorker(convNet, data) {
}
void GradCheckWorker::run() {
_convNet->checkGradients();
exit(0);
}
/*
* ====================
* MultiviewTestWorker
* ====================
*/
MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName)
: DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) {
assert(_data->getNumCases() % _numViews == 0);
}
MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews)
: DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") {
assert(_data->getNumCases() % _numViews == 0);
}
MultiviewTestWorker::~MultiviewTestWorker() {
delete _cpuProbs;
}
void MultiviewTestWorker::run() {
int numCasesReal = _dp->getNumCases() / _numViews;
int numMiniReal = DIVUP(numCasesReal, _dp->getMinibatchSize());
Cost& batchCost = *new Cost(0);
for (int i = 0; i < numMiniReal; i++) {
for (int v = 0; v < _numViews; v++) {
CPUData& mini = _dp->getDataSlice(v * numCasesReal + i * _dp->getMinibatchSize(),
min((v + 1) * numCasesReal, v * numCasesReal + (i + 1) * _dp->getMinibatchSize()));
_convNet->fprop(mini, v == 0 ? PASS_MULTIVIEW_TEST_START : v == _numViews - 1 ? PASS_MULTIVIEW_TEST_END : PASS_MULTIVIEW_TEST);
}
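// If a host buffer was supplied, copy the view-accumulated probabilities for this
// minibatch from the logreg layer back to the host.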
if (_cpuProbs != NULL) {
LogregCostLayer& logregLayer = *dynamic_cast<LogregCostLayer*>(&_convNet->getLayer(_logregName));
cudaSetDevice(logregLayer.getDeviceID());
Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(),
min(numCasesReal, (i + 1) * _dp->getMinibatchSize()));
NVMatrix& acts = logregLayer.getProbsAccum();
NVMatrix acts_T;
acts.transpose(acts_T);
acts_T.copyToHost(miniProbs);
delete &miniProbs;
}
_convNet->getCost(batchCost);
}
cudaDeviceSynchronize();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* FeatureWorker
* ====================
*/
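// Extracts the activations of the named layers for every data case and writes them
// into the supplied host matrices.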
FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames)
: DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames) {
assert(layerNames.size() == ftrs.size());
for (int i = 0; i < layerNames.size(); i++) {
assert(ftrs[i]->getNumRows() == data.getNumCases());
assert(!ftrs[i]->isTrans());
}
}
FeatureWorker::~FeatureWorker() {
for (int i = 0; i < _ftrs->size(); i++) {
delete _ftrs->at(i);
}
delete _ftrs;
delete _layerNames;
}
void FeatureWorker::run() {
Cost& batchCost = *new Cost(0);
for (int i = 0; i < _dp->getNumMinibatches(); i++) {
_convNet->fprop(i, PASS_FEATURE_GEN);
_convNet->getCost(batchCost);
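// Copy each requested layer's activations for this minibatch into the corresponding
// rows of its host feature matrix.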
for (int f = 0; f < _layerNames->size(); f++) {
Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f));
int d = ftrLayer.getDeviceID();
cudaSetDevice(d);
Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(),
min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
NVMatrix& acts = ftrLayer.getActs();
NVMatrix acts_T;
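// The host feature matrices are row-major with one row per case, so bring the GPU
// activations into that orientation before copying.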
if (acts.isTrans()) {
NVMatrix& actsTrans = acts.getTranspose();
actsTrans.transpose(acts_T);
delete &actsTrans;
} else {
acts.transpose(acts_T);
}
acts_T.copyToHost(miniFtrs);
delete &miniFtrs;
}
}
cudaDeviceSynchronize();
_convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
/*
* ====================
* DataGradWorker
* ====================
*/
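// Intended to compute the gradient of the cost with respect to the input data;
// the implementation below is commented out, so run() currently does nothing.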
DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx)
: DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) {
assert(dataGrads.getNumRows() == data.getNumCases());
assert(!dataGrads.isTrans());
}
DataGradWorker::~DataGradWorker() {
delete _dataGrads;
}
void DataGradWorker::run() {
// DataLayer& dataLayer = *dynamic_cast<DataLayer*>(&_convNet->getLayer(_dataLayerIdx));
// SoftmaxLayer& softmaxLayer = *dynamic_cast<SoftmaxLayer*>(&_convNet->getLayer(_softmaxLayerIdx));
// softmaxLayer.setDoLogregGrad(false);
// Cost& batchCost = *new Cost(0);
// for (int i = 0; i < _dp->getNumMinibatches(); i++) {
// _convNet->fprop(i, PASS_TEST);
// _convNet->getCost(batchCost);
// softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad());
//
// softmaxLayer.getActsGrad().addScalar(1);
// softmaxLayer.getActsGrad().scale(-1);
// softmaxLayer.incRcvdBInputs();
// softmaxLayer.bprop(PASS_TEST);
//
// Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(),
// min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
// NVMatrix& grads = dataLayer.getActsGrad();
// NVMatrix grads_T;
// if (grads.isTrans()) {
// NVMatrix& soft_T = grads.getTranspose();
// soft_T.transpose(grads_T);
// delete &soft_T;
// } else {
// grads.transpose(grads_T);
// }
// grads_T.copyToHost(miniDataGrads);
// delete &miniDataGrads;
//
// _convNet->reset();
// }
// cudaThreadSynchronize();
// _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}