Initial commit
Commit 9fdd561586: 246 changed files with 58283 additions and 0 deletions

src/convnet.cu (new file, 594 lines)
@@ -0,0 +1,594 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <vector>
#include <iostream>
#include <string>
#include <set>
#include <map>

#include <nvmatrix.cuh>
#include <nvmatrix_operators.cuh>
#include <matrix.h>
#include <convnet.cuh>
#include <util.cuh>

using namespace std;

/*
 * =======================
 * ConvNet
 * =======================
 */
ConvNet::ConvNet(PyObject* layerParams, intv& deviceIDs, vector<intv*>& deviceCPUs, int minibatchSize, int weightUpdateFreq) : Thread(false) {
    _weightUpdateFreq = weightUpdateFreq;
    _numBwdMiniPasses = 0;
    _deviceIDs = &deviceIDs;
    _deviceCPUs = &deviceCPUs;
    _data = NULL;
    _trainingProgress = 0;
    _sync = new ThreadSynchronizer(deviceIDs.size() + 1);
    seti pipeSet;
    pipeSet.insert(deviceIDs.begin(), deviceIDs.end());
    _pd = new PipeDispenserNonBlocking(pipeSet);
    PyObject* layerList = PyDict_Values(layerParams);

    // Data layers live on the manager thread (in CPU memory)
    for (int i = 0; i < PyList_GET_SIZE(layerList); i++) {
        PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
        string layerType = pyDictGetString(paramsDict, "type");
        if (layerType == "data") {
            DataLayer* d = new DataLayer(NULL, paramsDict);
            _dataLayers.push_back(d);
            _layerMap[d->getName()] = d;
        }
    }

    // Initialize GPU worker threads
    for (int d = 0; d < deviceIDs.size(); ++d) {
        ConvNetGPU* cng = new ConvNetGPU(layerList, deviceIDs[d], *deviceCPUs[d], this);

        _convNetThreads.push_back(cng);
        for (map<string, Layer*>::iterator it = cng->getLayerMap().begin(); it != cng->getLayerMap().end(); ++it) {
            _layerMap[it->first] = it->second;
        }
    }
    // Connect forward/backward links in graph
    for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        PyObject* paramsDict = PyDict_GetItemString(layerParams, it->first.c_str());
        PyObject* inputList = PyDict_GetItemString(paramsDict, "inputs");
        if (inputList != NULL) {
            for (int i = 0; i < PyList_GET_SIZE(inputList); i++) {
                string inputName = PyString_AsString(PyList_GetItem(inputList, i));
                it->second->addPrev(_layerMap[inputName]);
                _layerMap[inputName]->addNext(it->second);
            }
        }
    }

    _numFwdTerminal = 0;
    // Execute post-initialization stuff
    for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        it->second->postInit();
        _numFwdTerminal += it->second->getNext().size() == 0; // Number of terminal nodes going forward
    }
    // Find and count the terminal nodes in the backward pass
    set<string> visited, terminal;
    for (int t = 0; t < _convNetThreads.size(); t++) {
        vector<CostLayer*>& cl = _convNetThreads[t]->getCostLayers();
        for (int c = 0; c < cl.size(); c++) {
            findBwdTerminal(*cl[c], visited, terminal);
        }
    }
    _numBwdTerminal = terminal.size();
    // printf("num fwd terminals: %d, back terminals:\n", _numFwdTerminal);
    // for (set<string>::iterator it = terminal.begin(); it != terminal.end(); ++it) {
    //     printf("%s\n", (*it).c_str());
    // }
    _dp = new DataProvider(minibatchSize);

    Py_DECREF(layerList);
    assert(_weightUpdateFreq > 0);
}

void ConvNet::findBwdTerminal(Layer& l, set<string>& visited, set<string>& terminal) {
    if (visited.count(l.getName()) == 0) {
        visited.insert(l.getName());
        if (l.isGradConsumer()) {
            bool hasPrevConsumer = false;
            for (int i = 0; i < l.getPrev().size(); i++) {
                hasPrevConsumer |= l.getPrev()[i]->isGradConsumer();
            }
            if (!hasPrevConsumer || !l.isGradProducer()) {
                terminal.insert(l.getName());
                l.setBwdTerminal(true);
            } else if (l.isGradProducer()) {
                for (int i = 0; i < l.getPrev().size(); i++) {
                    if (l.getPrev()[i]->isGradConsumer()) {
                        findBwdTerminal(*l.getPrev()[i], visited, terminal);
                    }
                }
            }
        }
    }
}

void* ConvNet::run() {
    // The manager thread defaults to using the GPU of the first worker.
    // Put more logic here if this is inappropriate.
    NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());
    for (int t = 0; t < _convNetThreads.size(); t++) {
        _convNetThreads[t]->start();
    }
    copyToGPU();
    while (true) {
        Worker* worker = _workerQueue.dequeue();
        worker->run();
        delete worker;
    }
    return NULL;
}

Queue<Worker*>& ConvNet::getWorkerQueue() {
    return _workerQueue;
}

Queue<WorkResult*>& ConvNet::getResultQueue() {
    return _resultQueue;
}

DataProvider& ConvNet::getDataProvider() {
    return *_dp;
}

Layer& ConvNet::operator[](string& name) {
    return *_layerMap[name];
}

Layer& ConvNet::getLayer(string& name) {
    return *_layerMap[name];
}

void ConvNet::sendMessage(MESSAGES msg, bool sync) {
    for (int i = 0; i < _convNetThreads.size(); i++) {
        _convNetThreads[i]->enqueueMessage(new Message(msg));
        if (sync) {
            _convNetThreads[i]->enqueueMessage(new Message(SYNC));
        }
    }

    if (sync) {
        _sync->sync();
    }
}

void ConvNet::copyToCPU() {
    sendMessage(COPY_TO_CPU, true);
}

void ConvNet::copyToGPU() {
    sendMessage(COPY_TO_GPU, false);
}

void ConvNet::updateWeights() {
    sendMessage(UPDATE_WEIGHTS, true);
}

void ConvNet::reset() {
    sendMessage(RESET, false);
}

void ConvNet::fprop(PASS_TYPE passType) {
    assert(_data != NULL);
    reset();
    for (int i = 0; i < _dataLayers.size(); i++) {
        _dataLayers[i]->startFprop(*_data, passType);
    }
    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}

void ConvNet::fprop(CPUData& data, PASS_TYPE passType) {
    if (&data != _data) {
        delete _data;
    }
    _data = &data;
    fprop(passType);
}

void ConvNet::fprop(int miniIdx, PASS_TYPE passType) {
    delete _data;
    reset();
    if (miniIdx == 0 || miniIdx != _dataLayers[0]->getBufferMinibatchIdx()) {
        _data = &_dp->getMinibatch(miniIdx);
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->startFprop(*_data, passType);
        }
    } else {
        _data = _dataLayers[0]->getBufferData();
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->startFpropFromBuffer(passType);
        }
    }
    CPUData* nextData = miniIdx + 1 == _dp->getNumMinibatches() ? NULL : &_dp->getMinibatch(miniIdx + 1);
    if (nextData != NULL) {
        for (int i = 0; i < _dataLayers.size(); i++) {
            _dataLayers[i]->setBuffer(*nextData, miniIdx + 1);
        }
    }
    waitForTerminals(_numFwdTerminal, FPROP_TERMINAL);
}

void ConvNet::bprop(PASS_TYPE passType) {
    // Weights are updated when this is zero
    _numBwdMiniPasses = (_numBwdMiniPasses + 1) % _weightUpdateFreq;
    for (int i = 0; i < _convNetThreads.size(); i++) {
        _convNetThreads[i]->enqueueMessage(new BpropStartMessage(passType));
    }
    waitForTerminals(_numBwdTerminal, BPROP_TERMINAL);
    reset();
}

void ConvNet::waitForTerminals(int numMsgs, MESSAGES msg) {
    int terminalsDone = 0;
    while (terminalsDone++ < numMsgs) {
        Message* m = _msgQueue.dequeue();
        assert(m->getMessageType() == msg);
        delete m;
    }
}

// Same as getCost() but adds results to given cost and returns it
Cost& ConvNet::getCost(Cost& cost) {
    Cost& tmp = getCost();
    cost += tmp;
    delete &tmp;
    return cost;
}

Cost& ConvNet::getCost() {
    Cost& tmp = *new Cost(_data->getNumCases());
    for (int i = 0; i < _convNetThreads.size(); i++) {
        Cost& tmp2 = _convNetThreads[i]->getCost(_data->getNumCases());
        tmp |= tmp2;
        delete &tmp2;
    }
    return tmp;
}

double ConvNet::getCostValue() {
    Cost& cost = getCost();
    double val = cost.getValue();
    delete &cost;
    return val;
}

Queue<Message*>& ConvNet::getMessageQueue() {
    return _msgQueue;
}

int ConvNet::getDeviceID(int gpuIdx) {
    if (gpuIdx < 0) {
        return -1;
    }
    return _deviceIDs->at(gpuIdx);
}

intv& ConvNet::getDeviceIDs() {
    return *_deviceIDs;
}

ThreadSynchronizer& ConvNet::getSync() {
    return *_sync;
}

PipeDispenser& ConvNet::getPipeDispenser() {
    return *_pd;
}

void ConvNet::syncWithChildren() {
    sendMessage(SYNC, false);
    _sync->sync();
}

int ConvNet::getWeightUpdateFreq() {
    return _weightUpdateFreq;
}

int ConvNet::getNumBwdMiniPasses() {
    return _numBwdMiniPasses;
}

int ConvNet::getMinibatchSize() {
    return _dp->getMinibatchSize();
}

void ConvNet::setTrainingProgress(double progress) {
    _trainingProgress = progress;
}

double ConvNet::getTrainingProgress() const {
    return _trainingProgress;
}

/*
 * Gradient checking stuff
 */
void ConvNet::checkGradients() {
    _numFailures = 0;
    _numTests = 0;
    fprop(0, PASS_GC);
    _baseErr = getCostValue();
    bprop(PASS_GC);

    for (map<string, Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
        if (it->second->getDeviceID() >= 0) {
            NVMatrix::setDeviceID(it->second->getDeviceID());
            it->second->checkGradients();
        }
    }
    NVMatrix::setDeviceID(_convNetThreads[0]->getDeviceID());

    cout << "------------------------" << endl;
    if (_numFailures > 0) {
        cout << _numFailures << "/" << _numTests << " TESTS FAILED" << endl;
    } else {
        cout << "ALL " << _numTests << " TESTS PASSED" << endl;
    }
}

/*
 * name: weight matrix name
 * eps: finite difference step
 */
bool ConvNet::checkGradient(const string& name, float eps, Weights& weights) {
    Matrix numGrad(weights.getNumRows(), weights.getNumCols());
    Matrix diff(numGrad);
    numGrad.apply(Matrix::ZERO);
    Matrix weightsCPU;

    weights.getW().copyToHost(weightsCPU, true);

    for (int i = 0; i < weights.getNumRows(); i++) {
        for (int j = 0; j < weights.getNumCols(); j++) {
            float v = weightsCPU(i,j);
            weightsCPU(i,j) += eps;
            weights.getW().copyFromHost(weightsCPU);
            weightsCPU(i,j) = v;
            fprop(PASS_GC);
            double err = getCostValue();
            numGrad(i,j) = (err - _baseErr) / (_data->getNumCases() * eps);
            if (isnan(numGrad(i,j)) || isinf(numGrad(i,j))) {
                cout << "Numerical computation produced nan or inf when checking '" << name << "': " << numGrad(i,j) << endl;
                cout << "Consider reducing the sizes of the weights or finite difference steps." << endl;
                cout << "Exiting." << endl;
                exit(1);
            }
            weights.getW().copyFromHost(weightsCPU);
        }
    }
    Matrix gradCPU;
    weights.getGrad().copyToHost(gradCPU, true);
    gradCPU.scale(-1.0 / _data->getNumCases());
    float analNorm = gradCPU.norm();
    float numNorm = numGrad.norm();
    numGrad.subtract(gradCPU, diff);
    float relErr = diff.norm() / analNorm;
    bool fail = relErr >= GC_REL_ERR_THRESH;
    if (fail || !GC_SUPPRESS_PASSES) {
        cout << "========================" << endl;
        printf("(%s) %s GRADIENT CHECK\n", fail ? "****FAIL****" : "PASS", name.c_str());
        cout << "========================" << endl;
        cout << "Analytic:" << endl;
        gradCPU.print(0, 6, 0, 4);
        cout << "Numeric:" << endl;
        numGrad.print(0, 6, 0, 4);
        printf("Analytic norm: %e\n", analNorm);
        printf("Numeric norm: %e\n", numNorm);
        printf("Relative error: %e\n", relErr);
    }
    _numTests++;
    _numFailures += fail;
    return fail;
}

/*
 * =======================
 * ConvNetGPU
 * =======================
 */
ConvNetGPU::ConvNetGPU(PyObject* layerList, int deviceID, intv& deviceCPUs, ConvNet* convNet)
    : Thread(false, deviceCPUs), _deviceID(deviceID), _convNet(convNet) {
    try {
        int numLayers = PyList_GET_SIZE(layerList);

        for (int i = 0; i < numLayers; i++) {
            PyObject* paramsDict = PyList_GET_ITEM(layerList, i);
            int layerDeviceID = convNet->getDeviceID(pyDictGetInt(paramsDict, "gpu"));
            if (layerDeviceID == _deviceID) {
                initLayer(paramsDict);
            }
        }
    } catch (string& s) {
        cout << "Error creating ConvNet: " << s << endl;
        exit(1);
    }
}

void ConvNetGPU::initLayer(PyObject* paramsDict) {
    string type = pyDictGetString(paramsDict, "type");
    string name = pyDictGetString(paramsDict, "name");
    if (type == "fc") {
        _layerMap[name] = new FCLayer(this, paramsDict, false, true);
    } else if (type == "treefc") {
        _layerMap[name] = new TreeFCLayer(this, paramsDict);
    } else if (type == "conv") {
        _layerMap[name] = new ConvLayer(this, paramsDict);
    } else if (type == "local") {
        _layerMap[name] = new LocalUnsharedLayer(this, paramsDict);
    } else if (type == "pool") {
        _layerMap[name] = &PoolLayer::makePoolLayer(this, paramsDict);
    } else if (type == "rnorm") {
        _layerMap[name] = new ResponseNormLayer(this, paramsDict);
    } else if (type == "cmrnorm") {
        _layerMap[name] = new CrossMapResponseNormLayer(this, paramsDict);
    } else if (type == "cnorm") {
        _layerMap[name] = new ContrastNormLayer(this, paramsDict);
    } else if (type == "softmax") {
        _layerMap[name] = new SoftmaxLayer(this, paramsDict);
    } else if (type == "eltsum") {
        _layerMap[name] = new EltwiseSumLayer(this, paramsDict);
    } else if (type == "eltmax") {
        _layerMap[name] = new EltwiseMaxLayer(this, paramsDict);
    } else if (type == "neuron") {
        _layerMap[name] = new NeuronLayer(this, paramsDict);
    } else if (type == "nailbed") {
        _layerMap[name] = new NailbedLayer(this, paramsDict);
    } else if (type == "blur") {
        _layerMap[name] = new GaussianBlurLayer(this, paramsDict);
    } else if (type == "href") {
        _layerMap[name] = new HorizontalReflectionLayer(this, paramsDict);
    } else if (type == "resize") {
        _layerMap[name] = new ResizeLayer(this, paramsDict);
    } else if (type == "rgb2yuv") {
        _layerMap[name] = new RGBToYUVLayer(this, paramsDict);
    } else if (type == "rgb2lab") {
        _layerMap[name] = new RGBToLABLayer(this, paramsDict);
    } else if (type == "rscale") {
        _layerMap[name] = new RandomScaleLayer(this, paramsDict);
    } else if (type == "concat") {
        _layerMap[name] = new ConcatenationLayer(this, paramsDict);
    } else if (type == "hs") {
        _layerMap[name] = new HiddenSexLayer(this, paramsDict);
    } else if (strncmp(type.c_str(), "cost.", 5) == 0) {
        CostLayer* c = &CostLayer::makeCostLayer(this, type, paramsDict);
        _layerMap[name] = c;
        _costs.push_back(c);
    } else {
        throw string("Unknown layer type ") + type;
    }
}

/*
 * This executes in a new CPU thread so it's OK to initialize CUDA stuff here.
 */
void ConvNetGPU::initCuda() {
    NVMatrix::setDeviceID(_deviceID);
    checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));
    for (int i = 0; i < _convNet->getDeviceIDs().size(); i++) {
        int d = _convNet->getDeviceID(i);
        if (d != _deviceID) {
            if (NVMatrix::canAccessDevice(_deviceID, d)) {
                printf("Enabling peer access %d --> %d\n", NVMatrix::getDeviceID(), d);
                checkCudaErrors(cudaDeviceEnablePeerAccess(d, 0));
            } else {
                printf("No peer access %d --> %d\n", _deviceID, d);
            }
        }
    }
    NVMatrix::initCublas();
    NVMatrix::initRandom();
    srand(time(0));
}

void* ConvNetGPU::run() {
    initCuda();

    while (true) {
        Message* m = _msgQueue.dequeue();
        if (m->getMessageType() == FPROP_READY) {
            FpropMessage* msg = static_cast<FpropMessage*>(m);
            _layerMap[msg->getToLayer()]->fprop(msg->getPassType());
        } else if (m->getMessageType() == BPROP_READY) {
            BpropMessage* msg = static_cast<BpropMessage*>(m);
            _layerMap[msg->getToLayer()]->incRcvdBInputMsgs();
            _layerMap[msg->getToLayer()]->bprop(msg->getPassType());
        } else if (m->getMessageType() == BPROP_START) {
            BpropStartMessage* msg = static_cast<BpropStartMessage*>(m);
            for (int i = 0; i < _costs.size(); i++) {
                dynamic_cast<Layer*>(_costs[i])->bprop(msg->getPassType());
            }
        } else if (m->getMessageType() == SYNC) {
            _convNet->getSync().sync();
        } else if (m->getMessageType() == COPY_TO_CPU) {
            for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
                it->second->copyToCPU();
            }
        } else if (m->getMessageType() == COPY_TO_GPU) {
            for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
                it->second->copyToGPU();
            }
        } else if (m->getMessageType() == RESET) {
            for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
                it->second->reset();
            }
        } else if (m->getMessageType() == UPDATE_WEIGHTS) {
            for (map<string,Layer*>::iterator it = _layerMap.begin(); it != _layerMap.end(); ++it) {
                it->second->updateWeights();
            }
        } else if (m->getMessageType() == RUNME) {
            RunMeMessage* msg = static_cast<RunMeMessage*>(m);
            msg->run();
        }
        delete m;
    }
    return NULL;
}

Cost& ConvNetGPU::getCost(int numCases) {
    return *new Cost(numCases, _costs);
}

Layer& ConvNetGPU::operator[](string& name) {
    return *_layerMap[name];
}

Layer& ConvNetGPU::getLayer(string& name) {
    return *_layerMap[name];
}

int ConvNetGPU::getDeviceID() {
    return _deviceID;
}

Queue<Message*>& ConvNetGPU::getMessageQueue() {
    return _msgQueue;
}

void ConvNetGPU::enqueueMessage(Message* msg) {
    getMessageQueue().enqueue(msg);
}

vector<CostLayer*>& ConvNetGPU::getCostLayers() {
    return _costs;
}

map<string, Layer*>& ConvNetGPU::getLayerMap() {
    return _layerMap;
}

ConvNet& ConvNetGPU::getConvNet() {
    return *_convNet;
}
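
The two classes above form a manager/worker pair: ConvNet runs on the host side and hands work to one ConvNetGPU thread per device through message queues. A minimal driver sketch of that loop, assuming PASS_TRAIN is a PASS_TYPE value alongside the PASS_GC used above (everything else appears in this file or data.cu):

    // Sketch only, not part of this commit.
    void trainOneEpoch(ConvNet& net, CPUData& data) {
        net.getDataProvider().setData(data);
        for (int i = 0; i < net.getDataProvider().getNumMinibatches(); i++) {
            net.fprop(i, PASS_TRAIN);  // forward pass; also buffers minibatch i+1
            net.bprop(PASS_TRAIN);     // backward pass across all GPU threads
            if (net.getNumBwdMiniPasses() == 0) {
                net.updateWeights();   // fires once every _weightUpdateFreq bprop passes
            }
        }
    }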

src/cost.cu (new file, 126 lines)
@@ -0,0 +1,126 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <iostream>
#include <cost.cuh>

using namespace std;

/*
 * =====================
 * Cost
 * =====================
 */

Cost::Cost(int numCases) : _numCases(numCases) {
}

Cost::Cost(int numCases, vector<CostLayer*>& costs) : _numCases(numCases) {
    for (vector<CostLayer*>::iterator it = costs.begin(); it != costs.end(); ++it) {
        _costMap[(*it)->getName()] = &(*it)->getCost();
        _costCoeffMap[(*it)->getName()] = (*it)->getCoeff();
    }
}

int Cost::getNumCases() {
    return _numCases;
}

doublev& Cost::operator[](const string s) {
    return *_costMap[s];
}

CostMap& Cost::getCostMap() {
    return _costMap;
}

CostCoeffMap& Cost::getCostCoeffMap() {
    return _costCoeffMap;
}

double Cost::getValue() {
    double val = 0;
    for (CostMap::iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
        val += _costCoeffMap[it->first] * it->second->at(0);
    }
    return val;
}

Cost& Cost::operator+=(Cost& er) {
    CostMap& otherMap = er.getCostMap();
    CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
    for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
        if (_costMap.count(it->first) == 0) {
            _costMap[it->first] = new doublev();
            _costCoeffMap[it->first] = otherCoeffMap[it->first];
        }

        vector<double>& myVec = *_costMap[it->first];
        vector<double>& otherVec = *otherMap[it->first];
        assert(myVec.size() == 0 || myVec.size() == otherVec.size());
        for (int i = 0; i < otherVec.size(); i++) {
            if (myVec.size() <= i) {
                myVec.push_back(0);
            }
            myVec[i] += otherVec[i];
        }
    }
    _numCases += er.getNumCases();
    return *this;
}

// Merge costs in er into this cost
Cost& Cost::operator|=(Cost& er) {
    assert(er.getNumCases() == getNumCases());
    CostMap& otherMap = er.getCostMap();
    CostCoeffMap& otherCoeffMap = er.getCostCoeffMap();
    for (CostMap::const_iterator it = otherMap.begin(); it != otherMap.end(); ++it) {
        assert(_costMap.count(it->first) == 0);

        _costMap[it->first] = new doublev();
        _costCoeffMap[it->first] = otherCoeffMap[it->first];

        vector<double>& myVec = *_costMap[it->first];
        vector<double>& otherVec = *otherMap[it->first];
        myVec.insert(myVec.begin(), otherVec.begin(), otherVec.end());
    }
    return *this;
}

Cost& Cost::operator/=(const double v) {
    for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
        for (doublev::iterator it2 = it->second->begin(); it2 != it->second->end(); ++it2) {
            *it2 /= v;
        }
    }
    return *this;
}

Cost::~Cost() {
    for (CostMap::const_iterator it = _costMap.begin(); it != _costMap.end(); ++it) {
        delete it->second;
    }
}
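
The two merge operators aggregate along different axes: |= unions the disjoint per-GPU cost maps produced for one minibatch (it asserts equal numCases and no shared keys), while += accumulates successive minibatches elementwise and sums their case counts. A sketch of how they compose (variable names hypothetical; compare ConvNet::getCost above):

    Cost epoch(0);                                 // accumulates over minibatches
    for (int i = 0; i < numMinibatches; i++) {
        Cost& batch = *new Cost(casesInBatch(i));  // one minibatch, all GPUs
        for (int t = 0; t < gpuThreads.size(); t++) {
            Cost& part = gpuThreads[t]->getCost(casesInBatch(i));
            batch |= part;                         // disjoint layer costs, same cases
            delete &part;
        }
        epoch += batch;                            // elementwise add; cases summed
        delete &batch;
    }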

src/cpuCNN.cu (new file, 65 lines)
@@ -0,0 +1,65 @@
#include "softmaxtree.cuh"

/*
 * weights: (numNodes, numFeatures)
 * targets: (numNodes, numFeatures)
 */
void cpuSoftmaxTreeFwd(float* weights, float* targets, const int numFeatures, SoftmaxTree& tree) {
    for (int d = 0; d <= tree.getDepth(); ++d) {
        for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it != tree.getNodesAtDepth(d).end(); ++it) {
            SoftmaxNode& node = **it;
            SoftmaxNode* parent = node.getParent();
            for (int f = 0; f < numFeatures; ++f) {
                targets[node.getLabel() * numFeatures + f] = weights[node.getLabel() * numFeatures + f]
                                                           + (parent == NULL ? 0 : targets[parent->getLabel() * numFeatures + f]);
            }
        }
    }
}

/*
 * grads: (numNodes, numFeatures)
 */
void cpuSoftmaxTreeBwd(float* grads, const int numFeatures, SoftmaxTree& tree) {
    for (int h = 1; h <= tree.getHeight(); ++h) {
        for (SoftmaxNodeV::iterator it = tree.getNodesAtHeight(h).begin(); it != tree.getNodesAtHeight(h).end(); ++it) {
            SoftmaxNode& node = **it;
            for (int f = 0; f < numFeatures; ++f) {
                grads[node.getLabel() * numFeatures + f] = 0;
            }
            for (SoftmaxNodeV::iterator itc = node.getChildren().begin(); itc != node.getChildren().end(); ++itc) {
                SoftmaxNode& child = **itc;
                for (int f = 0; f < numFeatures; ++f) {
                    grads[node.getLabel() * numFeatures + f] += grads[child.getLabel() * numFeatures + f];
                }
            }
        }
    }
}

/*
 * weights:     (numNodes, numFeatures)
 * weightsInc:  (numNodes, numFeatures)
 * weightsGrad: (numNodes, numFeatures)
 *
 * node.getSize() gives the number of leaves under the node, which scales down
 * the weight decay applied to that node.
 */
void cpuSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
                                 const int numFeatures, float eps, const float mom, float wc, SoftmaxTree& tree) {
    for (int d = 0; d <= tree.getDepth(); d++) {
        for (SoftmaxNodeV::iterator it = tree.getNodesAtDepth(d).begin(); it != tree.getNodesAtDepth(d).end(); ++it) {
            SoftmaxNode& node = **it;
            float w = wc / node.getSize();
            float e = eps; // * sqrt(node.getSize());
            for (int f = 0; f < numFeatures; ++f) {
                weightsInc[node.getLabel() * numFeatures + f] = mom * weightsInc[node.getLabel() * numFeatures + f]
                                                              + e * (weightsGrad[node.getLabel() * numFeatures + f] - w * weights[node.getLabel() * numFeatures + f]);
                weights[node.getLabel() * numFeatures + f] += weightsInc[node.getLabel() * numFeatures + f];
            }
        }
    }
}
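
A worked example of the two tree recurrences above (illustrative): for a root r with internal child c and a leaf l under c, the depth-ordered forward loop produces targets[l] = weights[r] + weights[c] + weights[l], i.e. each node's effective weight vector is the sum of the raw weight vectors along its root path. The height-ordered backward loop then overwrites grads[c] with the sum of its children's gradients (leaves at height 0 keep theirs), so every interior node has accumulated the gradients of all leaves beneath it before its own parent is processed.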

src/data.cu (new file, 98 lines)
@@ -0,0 +1,98 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <algorithm>
#include <data.cuh>
#include <vector>

using namespace std;

DataProvider::DataProvider(int minibatchSize) :
    _minibatchSize(minibatchSize), _hData(NULL) {
}

void DataProvider::clearData() {
    delete _hData;
    _hData = NULL;
}

void DataProvider::setData(CPUData& hData) {
    // This is now deleted by the DataWorker's destructor
    // delete _hData; // Delete old CPU matrices

    _hData = &hData;
}

CPUData& DataProvider::getMinibatch(int idx) {
    assert(idx >= 0 && idx < getNumMinibatches());
    return getDataSlice(idx * _minibatchSize, (idx + 1) * _minibatchSize);
}

CPUData& DataProvider::getDataSlice(int startCase, int endCase) {
    assert(_hData != NULL);
    assert(_hData->getNumCases() > 0);
    endCase = min(_hData->getNumCases(), endCase);
    // TODO: maintain these matrices, no point re-creating them all the time
    MatrixV& miniData = *new MatrixV();

    for (int i = 0; i < _hData->getData().size(); i++) {
        // NOTE: if hData is transposed, then the output minibatch matrix
        // can be a view. No need to allocate new CPU memory here. Might
        // want to look into optimizing that in the future, though it's
        // unlikely to be a big deal.
        if (_hData->isTrans()) {
            miniData.push_back(&(*_hData)[i].sliceCols(startCase, endCase));
        } else {
            miniData.push_back(new Matrix());
            (*_hData)[i].sliceCols(startCase, endCase, *miniData.back());
        }
    }
    return *new CPUData(&miniData);
}

int DataProvider::getNumMinibatches() {
    assert(_hData != NULL);
    assert(_hData->getNumCases() > 0);
    return DIVUP(_hData->getNumCases(), _minibatchSize);
}

int DataProvider::getMinibatchSize() {
    return _minibatchSize;
}

int DataProvider::getNumCases() {
    assert(_hData != NULL);
    assert(_hData->getNumCases() > 0);
    return _hData->getNumCases();
}

int DataProvider::getNumCasesInMinibatch(int idx) {
    assert(_hData != NULL);
    assert(_hData->getNumCases() > 0);
    assert(idx >= 0 && idx < getNumMinibatches());
    return min(_minibatchSize, max(0, _hData->getNumCases() - idx * _minibatchSize));
}
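
DIVUP here is the project's integer ceiling division (assuming the usual definition from the utility headers, DIVUP(n, m) == (n + m - 1) / m), so the last minibatch may be short: 1000 cases with _minibatchSize = 128 give DIVUP(1000, 128) = 8 minibatches, the eighth holding the remaining 1000 - 7 * 128 = 104 cases, which is exactly what getNumCasesInMinibatch computes.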

src/hostmem.cu (new file, 34 lines)
@@ -0,0 +1,34 @@
#include <hostmem.cuh>

PinnedHostMem::PinnedHostMem() : _numBytes(0), _data(NULL) {
}

PinnedHostMem::~PinnedHostMem() {
    if (_numBytes > 0) {
        checkCudaErrors(cudaFreeHost(_data));
    }
}

void PinnedHostMem::resize(uint bytes) {
    if (_numBytes != bytes) {
        if (_numBytes > 0) {
            checkCudaErrors(cudaFreeHost(_data));
        }
        checkCudaErrors(cudaHostAlloc(&_data, bytes, cudaHostAllocPortable));
        _numBytes = bytes;
    }
}

void PinnedHostMem::copyFrom(void* src, uint bytes) {
    resize(bytes);
    checkCudaErrors(cudaMemcpy(_data, src, bytes, cudaMemcpyDefault));
}

void PinnedHostMem::copyTo(void* dst) {
    checkCudaErrors(cudaMemcpy(dst, _data, _numBytes, cudaMemcpyDefault));
}

void* PinnedHostMem::getData() {
    return _data;
}
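
PinnedHostMem wraps page-locked memory from cudaHostAlloc, which the GPU can DMA directly; host-device copies from it run faster than from pageable memory, which is what makes the data layers' minibatch staging worthwhile. A minimal usage sketch (buffer names hypothetical):

    PinnedHostMem staging;
    staging.copyFrom(hostSrc, nBytes);  // resizes the pinned buffer, then copies in
    staging.copyTo(dst);                // copies all _numBytes back out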

src/layer.cu (new file, 2002 lines)
File diff suppressed because it is too large

src/layer_kernels.cu (new file, 720 lines)
@@ -0,0 +1,720 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <assert.h>
#include <vector>
#include <layer_kernels.cuh>

//#define LOG(x) ((x) > 0.0 ? log(x) : -1000.0)

// Computes log(exp(x) + exp(y))
//#define LOGADD(x, y) ()

using namespace std;

/*
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * energies: (numOut, numCases)
 * labels: (1, numCases)
 * maxEnergies: (1, numCases)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 * top5Probs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 *
 * This routine uses energies to determine the top-1 score because they're more
 * accurate than the probabilities, which have numerical errors in them.
 */
__global__ void kMultiSoftmaxCost_engs(float* probs, float* energies, float* labels, float* maxEnergies,
                                       float* labelLogProbs, float* correctProbs, float* top5Probs,
                                       const int numCases, const int numOut, const int setSize) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxe = maxEnergies[tx];
        const float labelp = probs[label * numCases + tx];
        const float labele = energies[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        int numBiggerEnergies = 0, numEqualsEnergies = 0;
        for (int i = 0; i < numOut; ++i) {
            numBiggerEnergies += energies[i * numCases + tx] > labele;
            numEqualsEnergies += energies[i * numCases + tx] == labele;
        }

        const int slotsLeft = setSize - numBiggerEnergies;

        top5Probs[tx] = slotsLeft <= 0 ? 0 : (numEqualsEnergies <= slotsLeft ? 1 : float(slotsLeft) / numEqualsEnergies);
        // if (numEqualsEnergies != 1) {
        //     printf("numEqualsEnergies: %d, labelp: %e, maxp: %e\n", numEqualsEnergies, labelp, maxe);
        // }
        correctProbs[tx] = labele != maxe ? 0.0f : 1.0f / float(numEqualsEnergies);
    }
}

/*
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * labels: (1, numCases)
 * maxProbs: (1, numCases)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 * top5Probs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 */
__global__ void kMultiSoftmaxCost(float* probs, float* labels, float* maxProbs,
                                  float* labelLogProbs, float* correctProbs, float* top5Probs,
                                  const int numCases, const int numOut, const int setSize) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        int numBiggerProbs = 0, numEqualsProbs = 0;
        for (int i = 0; i < numOut; ++i) {
            numBiggerProbs += probs[i * numCases + tx] > labelp;
            numEqualsProbs += probs[i * numCases + tx] == labelp;
        }

        const int slotsLeft = setSize - numBiggerProbs;

        top5Probs[tx] = slotsLeft <= 0 ? 0.0f : (numEqualsProbs <= slotsLeft ? 1.0f : float(slotsLeft) / numEqualsProbs);
        correctProbs[tx] = labelp != maxp ? 0.0f : 1.0f / float(numEqualsProbs);
    }
}

/*
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * labels: (1, numCases)
 * maxProbs: (1, numCases)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 * top5Probs: (1, numCases) (*out)
 *
 * target: (1, numCases) == log(y_l[labels,:])
 */
void computeMultiSoftmaxCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& energies, NVMatrix& labelLogProbs_out,
                             NVMatrix& correctProbs_out, NVMatrix& top5Probs_out, int setSize, bool useEnergies) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());
    assert(energies.isContiguous());
    assert(energies.isSameDims(probs));

    // NVMatrix& maxProbs = probs.max(0);
    NVMatrix& maxPE = useEnergies ? energies.max(0) : probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    top5Probs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);

    if (useEnergies) {
        cudaFuncSetCacheConfig(kMultiSoftmaxCost_engs, cudaFuncCachePreferL1);
        kMultiSoftmaxCost_engs<<<blocks, threads>>>(probs.getDevData(), energies.getDevData(), labels.getDevData(), maxPE.getDevData(),
                                                    labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
                                                    numCases, numOut, setSize);
    } else {
        cudaFuncSetCacheConfig(kMultiSoftmaxCost, cudaFuncCachePreferL1);
        kMultiSoftmaxCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxPE.getDevData(),
                                               labelLogProbs_out.getDevData(), correctProbs_out.getDevData(), top5Probs_out.getDevData(),
                                               numCases, numOut, setSize);
    }

    getLastCudaError("computeMultiSoftmaxCost: Kernel execution failed");
    // cudaThreadSynchronize();
    delete &maxPE;
}
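
/*
 * Note on the top-k expression in the two kernels above (editorial gloss, not in
 * the original file): numBigger strictly better outputs claim slots, leaving
 * slotsLeft = setSize - numBigger for the numEquals outputs tied with the true
 * label. Breaking ties uniformly at random, the label lands in the top-setSize
 * set with probability
 *     clamp(slotsLeft / numEquals, 0, 1),
 * which is exactly the value stored into top5Probs.
 */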

/*
 * energies: (numCases, numOut) (yes this is weird)
 * bLattice: (numOut + 1, setSize, numCases) (*out)
 *
 * This is intended to work for cases when setSize <= 32.
 * Block size (y, x) = (1, B_X)
 *
 * NOTE:
 * B_X must be a multiple of 32
 */
template <int B_X>
__global__ void kMSMBackward(float* energies, float* bLattice, const int numCases, const int numOut, const int setSize) {
    extern __shared__ float shmem[];
    const int tidx = blockIdx.x * B_X + threadIdx.x;
    const int casesPerWarp = 32 / setSize;
    const int casesPerBlock = casesPerWarp * B_X / 32;
    const int numWorkersPerWarp = casesPerWarp * setSize;
    const int tidxInWarp = tidx % 32;
    const int warpIdx = tidx / 32;
    const int blockCaseIdx = blockIdx.x * casesPerBlock;
    const int caseIdxInBlock = threadIdx.x / setSize;
    const int caseIdx = warpIdx * casesPerWarp + tidxInWarp / setSize;
    const bool doWork = tidxInWarp < numWorkersPerWarp && caseIdx < numCases;

    const int bIdx = threadIdx.x + threadIdx.x / setSize + 1;
    volatile float* B = shmem;
    volatile float* shE = &shmem[B_X + casesPerBlock]; // Dimensions (casesPerBlock, 32 + 1)

    const int loadY = warpIdx;
    const int loadX = tidxInWarp;

    energies += (blockCaseIdx + loadY) * numOut + loadX;
    bLattice += tidx;
    // This early return and the printfs below appear to be leftover debugging
    // scaffolding: only block 0 does any work.
    if (blockIdx.x != 0) {
        return;
    }
    // The first row of the lattice has a 1 in the columns corresponding to
    // zero set size, 0 elsewhere.
    for (int t = threadIdx.x; t < B_X + casesPerBlock; t += B_X) {
        B[t] = t % setSize == 0;
    }

    for (int l = 0; l < numOut / 32; ++l) { // Load 32 energies at a time for casesPerBlock cases
        __syncthreads();
        // Load energies into shmem
        for (int r = 0; r < casesPerBlock && blockCaseIdx + loadY + r < numCases; r += B_X / 32) {
            shE[(r + loadY) * (32 + 1) + loadX] = __expf(energies[r * numOut]);
            printf("%f\n", energies[r * numOut]);
        }
        __syncthreads();

        // Compute 32 rows of the lattice
        if (doWork) {
            #pragma unroll
            for (int i = 0; i < 32; ++i) {
                B[bIdx] = B[bIdx - 1] * shE[caseIdxInBlock * (32 + 1) + i] + B[bIdx];
                bLattice[i * numCases * setSize] = B[bIdx];
                // printf("thread %d wrote %d to idx %d\n", tidx, B[bIdx], bIdx);
            }
        }
        printf("thread %d made it\n", tidx);
        bLattice += 32 * numCases * setSize;
    }
    // if (numOut % 32 != 0) {
    //     __syncthreads();
    //
    // }
}

/*
 * energies: (numCases, numOut) (yes this is weird)
 * bLattice: (numOut + 1, setSize, numCases) (*out)
 */
void MSMBackward(NVMatrix& energies, NVMatrix& bLattice, int setSize) {
    int numCases = energies.getNumRows();
    int numOut = energies.getNumCols();

    assert(!energies.isTrans());
    assert(!bLattice.isTrans());
    assert(energies.isContiguous());
    assert(bLattice.isContiguous());

    bLattice.resize((numOut + 1) * setSize, numCases);
    int B_X = 32;
    int casesPerBlock = B_X / setSize;
    int shmem = 4 * (B_X + casesPerBlock + casesPerBlock * (32 + 1));
    dim3 threads(B_X, 1);
    dim3 blocks(DIVUP(numCases * setSize, B_X), 1);
    printf("allocating %d words of shmem\n", shmem / 4);
    cudaFuncSetCacheConfig(kMSMBackward<32>, cudaFuncCachePreferShared);
    kMSMBackward<32><<<blocks, threads, shmem>>>(energies.getDevData(), bLattice.getDevData(),
                                                 numCases, numOut, setSize);
    getLastCudaError("kMSMBackward: Kernel execution failed");
}

/*
 * E = sum(p_l * log(y_l))
 * probs: (numOut, numCases)
 * labels: (numOut, numCases)
 * maxProbs: (1, numCases)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 */
__global__ void kCrossEntCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                              const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        probs += tx;
        labels += tx;
        maxProbs += tx;
        labelLogProbs += tx;
        correctProbs += tx;

        const float maxp = maxProbs[0];

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        float crossEnt = 0.0f;
        int numMax = 0;
        bool correctLabel = false;
        for (int i = 0; i < numOut; i++) {
            const float label_prob = labels[i * numCases];
            const float model_prob = probs[i * numCases];
            numMax += model_prob == maxp;
            crossEnt += label_prob * safelog(model_prob);
            correctLabel |= model_prob == maxp && label_prob > 0.0f;
        }
        labelLogProbs[0] = crossEnt;
        if (!correctLabel) {
            correctProbs[0] = 0.0f;
        } else {
            correctProbs[0] = 1.0f / float(numMax);
        }
    }
}

/*
 * E = sum(p_l * log(y_l))
 * y_l: (numOut, numCases)
 * labels: (numOut, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kCrossEntGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                              const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const float label_prob = labels[tidx];
        const float model_prob = y_l[tidx];
        const float v = gradCoeff * __fdividef(label_prob, model_prob);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}

/*
 * E = sum(p_l * log(y_l))
 * y_l: (numOut, numCases)
 * labels: (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kCrossEntSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                     const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        float v = 0;
        const float model_prob = y_l[tidx];
        for (int j = 0; j < numOut; j++) {
            const float label_prob = labels[j * numCases + tx];
            v += label_prob * ((j == ty) - model_prob);
        }
        v *= gradCoeff;
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}

/*
 * E = -log(y_t)
 * probs: (numOut, numCases)
 * labels: (1, numCases)
 * maxProbs: (1, numCases)
 * labelLogProbs: (1, numCases) (*out)
 * correctProbs: (1, numCases) (*out)
 *
 * target: (1, numCases)
 */
__global__ void kLogregCost(float* probs, float* labels, float* maxProbs, float* labelLogProbs, float* correctProbs,
                            const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_ERR_THREADS_X + threadIdx.x;

    if (tx < numCases) {
        const int label = int(labels[tx]);
        const float maxp = maxProbs[tx];
        const float labelp = probs[label * numCases + tx];

        labelLogProbs[tx] = __logf(labelp);

        /*
         * Compute the probability of guessing the correct case if you take the most-probable label.
         *
         * This is done like this:
         *
         * - If the most probable label is not equal to the true label, then the probability is zero.
         * - Otherwise, the probability is 1 / (number of labels whose probability is equal to the maximum).
         *
         * This is certainly overkill -- in practice, it's just about impossible for two labels to get assigned
         * maximum probability. But it's a safety measure to prevent over-estimating your accuracy.
         * Though it could never happen in reality. Well it could. But it wouldn't. Cool?
         */
        if (labelp != maxp) {
            correctProbs[tx] = 0;
        } else {
            int numMax = 0;
            for (int i = 0; i < numOut; i++) {
                numMax += probs[i * numCases + tx] == maxp;
            }
            correctProbs[tx] = 1.0f / float(numMax);
        }
    }
}

/*
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dy_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregCostGrad(float* y_l, float* labels, float* dE_dy_l, const int numCases,
                                const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * (label == ty);
        v = __fdividef(v, y_l[tidx]);
        if (add) {
            dE_dy_l[tidx] += v;
        } else {
            dE_dy_l[tidx] = v;
        }
    }
}

/*
 * E = -log(y_t)
 * y_l: (numOut, numCases)
 * labels: (1, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kLogregSoftmaxGrad(float* y_l, float* labels, float* dE_dx_l, const int numCases,
                                   const int numOut, const float gradCoeff) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        const int label = int(labels[tx]);
        float v = gradCoeff * ((label == ty) - y_l[tidx]);
        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
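
/*
 * Derivation note (editorial, not in the original file): with y = softmax(x) and
 * E = -log(y_t), the chain rule gives dE/dx_j = y_j - [j == t]. The fused kernel
 * above therefore writes gradCoeff * ([label == ty] - y_l[tidx]), the negative
 * gradient scaled by the cost coefficient. This matches kLogregCostGrad, which
 * writes gradCoeff * [label == ty] / y_l, i.e. -gradCoeff * dE/dy, but avoids
 * the division by a possibly tiny y_l.
 */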

/*
 * dE_dy_l: (numOut, numCases)
 * y_l: (numOut, numCases)
 *
 * dE_dx_l: (numOut, numCases)
 */
template <bool add>
__global__ void kSoftmaxGrad(float* dE_dy_l, float* y_l, float* dE_dx_l, const int numCases, const int numOut) {
    const int tx = blockIdx.x * LOGREG_GRAD_THREADS_X + threadIdx.x;
    const int ty = blockIdx.y * LOGREG_GRAD_THREADS_Y + threadIdx.y;
    const int tidx = ty * numCases + tx;

    if (ty < numOut && tx < numCases) {
        float v = 0;
        for (int j = 0; j < numOut; j++) {
            v += dE_dy_l[j * numCases + tx] * ((j == ty) - y_l[j * numCases + tx]);
        }
        v *= y_l[tidx];

        if (add) {
            dE_dx_l[tidx] += v;
        } else {
            dE_dx_l[tidx] = v;
        }
    }
}
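
/*
 * Derivation note (editorial, not in the original file): since
 * y_i = exp(x_i) / sum_k exp(x_k), the softmax Jacobian is
 *     dy_j/dx_i = y_j * ((j == i) - y_i),
 * and summing over j collapses to dE/dx_i = y_i * (dE/dy_i - sum_j y_j * dE/dy_j),
 * which is what the loop above computes: v accumulates dE/dy_j * ((j == ty) - y_j)
 * and is then scaled by y_l[tidx].
 */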

template <int B_X, bool add>
__global__ void kEltwiseMaxGrad(float* actGrad, float* input, float* output, float* target,
                                const int numElements) {
    for (int i = B_X * blockIdx.x + threadIdx.x; i < numElements; i += B_X * gridDim.x) {
        if (add) {
            target[i] += actGrad[i] * (output[i] == input[i]);
        } else {
            target[i] = actGrad[i] * (output[i] == input[i]);
        }
    }
}

void computeEltwiseMaxGrad(NVMatrix& actGrad, NVMatrix& input, NVMatrix& output, NVMatrix& target, bool add) {
    assert(actGrad.isContiguous());
    assert(output.isContiguous());
    assert(input.isContiguous());
    assert(actGrad.isSameDims(input));
    assert(actGrad.isSameDims(output));

    dim3 blocks(DIVUP(actGrad.getNumElements(), 128));
    dim3 threads(128);
    if (add) {
        assert(actGrad.isSameDims(target));
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, true>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, true><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    } else {
        target.resize(actGrad);
        cudaFuncSetCacheConfig(kEltwiseMaxGrad<128, false>, cudaFuncCachePreferL1);
        kEltwiseMaxGrad<128, false><<<blocks, threads>>>(actGrad.getDevData(), input.getDevData(), output.getDevData(), target.getDevData(), actGrad.getNumElements());
    }

    getLastCudaError("computeEltwiseMaxGrad: Kernel execution failed");
}
|
||||
|
||||
/*
|
||||
* E = sum_i{-p_i*log(y_i)}
|
||||
* probs: (numOut, numCases)
|
||||
* labels: (numOut, numCases)
|
||||
* maxProbs: (1, numCases)
|
||||
* labelLogProbs: (1, numCases) (*out)
|
||||
* correctProbs: (1, numCases) (*out)
|
||||
*
|
||||
* target: (1, numCases)
|
||||
*/
|
||||
void computeCrossEntCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
|
||||
int numCases = probs.getNumCols();
|
||||
int numOut = probs.getNumRows();
|
||||
|
||||
assert(labels.isSameDims(probs));
|
||||
assert(!labels.isTrans());
|
||||
assert(!probs.isTrans());
|
||||
assert(labels.isContiguous());
|
||||
assert(probs.isContiguous());
|
||||
|
||||
NVMatrix& maxProbs = probs.max(0);
|
||||
|
||||
labelLogProbs_out.resize(1, numCases);
|
||||
correctProbs_out.resize(1, numCases);
|
||||
dim3 threads(LOGREG_ERR_THREADS_X, 1);
|
||||
dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
|
||||
cudaFuncSetCacheConfig(kCrossEntCost, cudaFuncCachePreferL1);
|
||||
kCrossEntCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
|
||||
labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
|
||||
numCases, numOut);
|
||||
getLastCudaError("kCrossEntCost: Kernel execution failed");
|
||||
|
||||
delete &maxProbs;
|
||||
}

void computeCrossEntGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.isSameDims(probs));
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    if (!add) {
        target.resize(probs);
        kCrossEntGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                  numCases, numOut, coeff);
    } else {
        kCrossEntGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                 numCases, numOut, coeff);
    }

    getLastCudaError("kCrossEntGrad: Kernel execution failed");
}

void computeSoftmaxGrad(NVMatrix& acts, NVMatrix& actsGrad, NVMatrix& target, bool add) {
    int numCases = acts.getLeadingDim();
    int numOut = acts.getFollowingDim();

    assert(acts.isSameDims(actsGrad));
    assert(acts.isContiguous());
    assert(actsGrad.isContiguous());
    assert(target.isContiguous());
    assert(acts.isTrans());
    assert(actsGrad.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    if (!add) {
        target.resize(acts);
        kSoftmaxGrad<false><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
    } else {
        kSoftmaxGrad<true><<<blocks, threads>>>(actsGrad.getDevData(), acts.getDevData(), target.getDevData(), numCases, numOut);
    }
    getLastCudaError("computeSoftmaxGrad: Kernel execution failed");
}

void computeCrossEntSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getLeadingDim() == probs.getLeadingDim() && labels.getFollowingDim() == probs.getFollowingDim());
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());
    assert(!labels.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    if (!add) {
        target.resize(probs);
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<false>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                         numCases, numOut, coeff);
    } else {
        cudaFuncSetCacheConfig(kCrossEntSoftmaxGrad<true>, cudaFuncCachePreferL1);
        kCrossEntSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                        numCases, numOut, coeff);
    }
    getLastCudaError("kCrossEntSoftmaxGrad: Kernel execution failed");
}
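/*
 * Why the fused path exists, as a worked equation: with y = softmax(x) and
 * E = -sum_i p_i*log(y_i) where the p_i sum to 1, the chain rule collapses to
 * dE/dx_i = y_i - p_i. kCrossEntSoftmaxGrad can therefore write
 * coeff * (label - prob) per element in one pass (modulo its sign convention),
 * instead of producing dE/dy and then backpropagating through computeSoftmaxGrad.
 */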

/*
 * E = -log(y_t)
 * probs:         (numOut, numCases)
 * labels:        (1, numCases)
 * maxProbs:      (1, numCases)
 * labelLogProbs: (1, numCases)   (*out)
 * correctProbs:  (1, numCases)   (*out)
 *
 * target:        (1, numCases) == log(y[labels,:])
 */
void computeLogregCost(NVMatrix& labels, NVMatrix& probs, NVMatrix& labelLogProbs_out, NVMatrix& correctProbs_out) {
    int numCases = probs.getNumCols();
    int numOut = probs.getNumRows();

    assert(labels.getNumElements() == numCases);
    assert(!labels.isTrans());
    assert(!probs.isTrans());
    assert(labels.isContiguous());
    assert(probs.isContiguous());

    NVMatrix& maxProbs = probs.max(0);

    labelLogProbs_out.resize(1, numCases);
    correctProbs_out.resize(1, numCases);
    dim3 threads(LOGREG_ERR_THREADS_X, 1);
    dim3 blocks(DIVUP(numCases, LOGREG_ERR_THREADS_X), 1);
    cudaFuncSetCacheConfig(kLogregCost, cudaFuncCachePreferL1);
    kLogregCost<<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), maxProbs.getDevData(),
                                     labelLogProbs_out.getDevData(), correctProbs_out.getDevData(),
                                     numCases, numOut);
    getLastCudaError("computeLogregCost: Kernel execution failed");
//    cudaThreadSynchronize();
    delete &maxProbs;
}

void computeLogregGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(!labels.isTrans());
    assert(!probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    if (!add) {
        target.resize(probs);
        kLogregCostGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                    numCases, numOut, coeff);
    } else {
        kLogregCostGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                   numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregGrad: Kernel execution failed");
}

void computeLogregSoftmaxGrad(NVMatrix& labels, NVMatrix& probs, NVMatrix& target, bool add, float coeff) {
    int numCases = probs.getLeadingDim();
    int numOut = probs.getFollowingDim();
    assert(labels.getNumElements() == numCases);
    assert(probs.isContiguous());
    assert(target.isContiguous());
    assert(labels.isContiguous());
    assert(probs.isTrans());

    dim3 threads(LOGREG_GRAD_THREADS_X, LOGREG_GRAD_THREADS_Y);
    dim3 blocks(DIVUP(numCases, LOGREG_GRAD_THREADS_X), DIVUP(numOut, LOGREG_GRAD_THREADS_Y));
    if (!add) {
        target.resize(probs);
        kLogregSoftmaxGrad<false><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                       numCases, numOut, coeff);
    } else {
        kLogregSoftmaxGrad<true><<<blocks, threads>>>(probs.getDevData(), labels.getDevData(), target.getDevData(),
                                                      numCases, numOut, coeff);
    }

    getLastCudaError("computeLogregSoftmaxGrad: Kernel execution failed");
}
186
src/lr.cu
Normal file
@@ -0,0 +1,186 @@
#include <string>
#include <lr.cuh>
#include <util.cuh>

using namespace std;

/*
 * ==================================
 * LearningRateSchedule
 * ==================================
 */
LearningRateSchedule& LearningRateSchedule::make(PyObject* lrsDict, double baseRate) {
    string type = pyDictGetString(lrsDict, "type");
    if (type == "default") {
        return *new LearningRateSchedule(baseRate, 0);
    } else {
        PyObject* paramsDict = PyDict_GetItemString(lrsDict, "params");
        double tgtFactor = pyDictGetFloat(paramsDict, "tgtFactor");
        double noiseStdev = pyDictGetFloat(paramsDict, "noiseStdev");
        if (type == "linear") {
            return *new LinearLRS(baseRate, tgtFactor, noiseStdev);
        } else if (type == "exp") {
            return *new ExpLRS(baseRate, tgtFactor, noiseStdev);
        } else if (type == "dexp") {
            int numSteps = pyDictGetInt(paramsDict, "numSteps");
            return *new DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
        } else if (type == "jdexp") {
            int numSteps = pyDictGetInt(paramsDict, "numSteps");
            return *new JumpyDiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps);
        }
    }
    throw string("Unknown learning rate schedule type ") + type;
}

LearningRateSchedule::LearningRateSchedule(double baseRate, double noiseStdev)
    : _baseRate(baseRate), _noiseStdev(noiseStdev), _haveRandnSpare(false), _randnSpare(0) {
}

LearningRateSchedule::LearningRateSchedule(double baseRate)
    : _baseRate(baseRate), _noiseStdev(0), _haveRandnSpare(false), _randnSpare(0) {
}

double LearningRateSchedule::getRate(double progress) {
    return _noiseStdev > 0 ? _getRate(progress) * (1 + abs(randn()) * _noiseStdev)
                           : _getRate(progress);
}

double LearningRateSchedule::_getRate(double progress) {
    return _baseRate;
}

inline double LearningRateSchedule::randn() {
    if (!_haveRandnSpare) {
        double T = 2 * 3.14159265358979323846 * rand(); // 2*pi*u2
        double R = std::sqrt(-2 * std::log(rand()));
        _randnSpare = R * std::sin(T);
        _haveRandnSpare = true;
        return R * std::cos(T);
    }
    _haveRandnSpare = false;
    return _randnSpare;
}
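/*
 * randn() above is the Box-Muller transform: from two uniforms u1, u2 in (0, 1],
 * R = sqrt(-2*ln(u1)) and T = 2*pi*u2 yield two independent standard normals
 * R*cos(T) and R*sin(T); one is returned immediately and the other is cached in
 * _randnSpare for the next call.
 */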

// This should never generate zero
inline double LearningRateSchedule::rand() const {
    return double(1L + random()) / (1L + RAND_MAX);
}

inline double LearningRateSchedule::abs(double x) const {
    return x > 0 ? x : -x;
}

double LearningRateSchedule::getBaseRate() const {
    return _baseRate;
}

LearningRateSchedule::~LearningRateSchedule() {
}

/*
 * ==================================
 * LinearLRS
 * ==================================
 */
LinearLRS::LinearLRS(double baseRate, double tgtFactor, double noiseStdev)
    : LearningRateSchedule(baseRate, noiseStdev) {
    _finalRate = baseRate / tgtFactor;
}

double LinearLRS::_getRate(double progress) {
    return _baseRate * (1 - progress) + _finalRate * progress;
}

/*
 * ==================================
 * ExpLRS
 * ==================================
 */
ExpLRS::ExpLRS(double baseRate, double tgtFactor, double noiseStdev)
    : LearningRateSchedule(baseRate, noiseStdev) {
    double finalRate = baseRate / tgtFactor;
    _pow = baseRate == 0 ? 1 : (std::log(finalRate) / std::log(baseRate) - 1);
}

double ExpLRS::_getRate(double progress) {
    return std::pow(_baseRate, 1.0 + progress * _pow);
}

/*
 * ==================================
 * TanhLRS
 * ==================================
 */
TanhLRS::TanhLRS(double baseRate, double tgtFactor, double noiseStdev)
    : LearningRateSchedule(baseRate, noiseStdev), _alpha(0), _beta(0) {
    if (baseRate > 0) {
        double finalRate = baseRate / tgtFactor;
        _beta = 0.5 * (baseRate + finalRate);
        _alpha = 2 * atanh((baseRate - finalRate) / (baseRate + finalRate));
    }
}

double TanhLRS::_getRate(double progress) {
    return _beta * (tanh(-_alpha * (progress - 0.5)) + 1.0);
}

/*
 * ==================================
 * DiscreteExpLRS
 * ==================================
 */
DiscreteExpLRS::DiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
    : LearningRateSchedule(baseRate, noiseStdev) {
    ExpLRS elrs(baseRate, tgtFactor, 0);
    double finalRate = baseRate / tgtFactor;
    for (int i = 0; i < numSteps - 1; i++) {
        double progress = double(i) / (numSteps - 1);
        _rates.push_back(elrs._getRate(progress));
    }
    _rates.push_back(finalRate);
    //printf("initialized base %e, final %e, steps %d\n", baseRate, finalRate, numSteps);
}

double DiscreteExpLRS::_getRate(double progress) {
    for (int i = 0; i < _rates.size(); ++i) {
        if (progress <= double(i + 1) / _rates.size()) {
            return _rates[i];
        }
    }
    return _rates.back();
}

/*
 * ==================================
 * JumpyDiscreteExpLRS
 * ==================================
 */
JumpyDiscreteExpLRS::JumpyDiscreteExpLRS(double baseRate, double tgtFactor, double noiseStdev, int numSteps)
    : DiscreteExpLRS(baseRate, tgtFactor, noiseStdev, numSteps) {
}

double JumpyDiscreteExpLRS::_getRate(double progress) {
    int rateIdx = 0;
    for (int i = 0; i < _rates.size(); ++i) {
        if (progress <= double(i + 1) / _rates.size()) {
            rateIdx = i;
            break;
        }
    }
    // The midpoint of the interval that progress falls into.
    double intervalMid = double(rateIdx + 0.5) / _rates.size();
    // The jumpy learning rate works like this:
    // If progress is before the midpoint of the current interval, it returns
    // the same learning rate as DiscreteExpLRS would. Otherwise, it "jumps"
    // back toward the learning rate of the *previous* interval (provided
    // there is one, and progress is within the (0.2, 0.9) window).
    // Variant 1 -- use the previous interval's rate directly:
    // rateIdx -= rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;

    // Variant 2 (active) -- instead of the previous interval's rate, use the
    // geometric average of the current and previous intervals' rates.
    bool jump = rateIdx > 0 && progress > 0.2 && progress < 0.9 && progress > intervalMid;
    return jump ? sqrt(_rates[rateIdx] * _rates[rateIdx - 1]) : _rates[rateIdx];
    // return _rates[rateIdx];
}
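/*
 * Worked example (a sketch, assuming numSteps = 4, i.e. rates r0 > r1 > r2 > r3):
 * progress = 0.55 falls in interval 2 (bounds (0.5, 0.75]), whose midpoint is
 * 0.625, so the rate is r2 exactly as in DiscreteExpLRS. At progress = 0.7
 * (past the midpoint, and inside the (0.2, 0.9) jump window) the active
 * variant returns sqrt(r2 * r1), the geometric mean with the previous interval.
 */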
126
src/multisoftmax.cpp
Normal file
@@ -0,0 +1,126 @@

#include <assert.h>
//#include <mathimf.h>
#include <multisoftmax.h>

using namespace std;

// Computes log(exp(x) + exp(y))
inline double logadd(const double x, const double y) {
    if (x <= -INF && y <= -INF) {
        return -INF;
    }
    const double M = max(x, y);
    const double m = min(x, y);
    const double diff = M - m;
//    return diff > 15 ? M : M + LOG(1.0 + EXP(-diff));
//    return m <= -INF ? M : M + LOG(1.0f + EXP(-diff));
    return diff > 15 ? M : (diff > 5 ? M + EXP(-diff) : M + LOG(1.0 + EXP(-diff)));
}
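/*
 * logadd() is the standard log-sum-exp trick: log(exp(x) + exp(y))
 * = M + log(1 + exp(-(M - m))) with M = max(x,y), m = min(x,y), so exp() never
 * overflows. Worked example: logadd(1000.0, 1000.1) = 1000.1 + log(1 + exp(-0.1))
 * ~= 1000.744, where the naive form overflows. For diff > 15, exp(-diff) < 3.1e-7
 * and log(1 + exp(-diff)) ~= exp(-diff), which motivates the branches above.
 */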

/*
 * elts:  (numCases, numOut)
 * B:     (N + 1, size + 1) -- backward lattice matrix, MUST BE initially -INF
 * fixed: (numCases, 1)
 * probs: (numCases, numOut)   (*out)
 *
 * Double precision is much faster than single here. :/
 */
void MultiSoftmaxCPU_T_logspace(Matrix& elts, Matrix& logB, Matrix& probs, Matrix& fixed, int size, bool nofix) {
    int numCases = elts.getNumRows();
    assert(probs.isSameDims(elts));
    assert(!elts.isTrans());
    assert(!logB.isTrans());
    assert(!probs.isTrans());
    assert(fixed.getNumRows() == numCases);
    assert(fixed.getNumCols() == 1);
    int N = elts.getNumCols();
    Matrix& logF = *new Matrix(size + 1, 1); // Forward column

    // Prepare logB
    logB(N, 0) = 0;

    for (int c = 0; c < numCases; ++c) {
        int fx = nofix ? -1 : int(fixed(c, 0));
        // Backward pass
        for (int i = N - 1; i >= 0; --i) {
            double elt = elts(c, i);
            logB(i, 0) = i <= fx ? -INF : 0.0f;
            for (int s = max(1, size - i); s < size + 1; ++s) {
                logB(i, s) = fx == i ? logB(i + 1, s - 1) + elt : logadd(logB(i + 1, s - 1) + elt, logB(i + 1, s));
            }
        }
        // Log partition function
        double logZ = logB(0, size);

        // Forward pass
        logF.apply(Matrix::ONE);
        logF.scale(-INF);
        logF(0, 0) = 0;

        for (int i = 1; i < N + 1; ++i) {
            double logy = -INF;
            double elt = elts(c, i - 1);
            for (int s = size; s >= 0; --s) {
                if (s < size) {
                    logy = logadd(logy, logF(s, 0) + logB(i, size - 1 - s));
                }
                if (s > 0) {
                    logF(s, 0) = fx == i - 1 ? logF(s - 1, 0) + elt : logadd(logF(s - 1, 0) + elt, logF(s, 0));
                } else if (fx == i - 1) {
                    logF(0, 0) = -INF;
                }
            }
            logy += elt - logZ;
            probs(c, i - 1) = EXP(logy) - (fx >= 0 ? probs(c, i - 1) : 0);
        }
    }

    delete &logF;
}

MultiSoftmaxWorker::MultiSoftmaxWorker(Matrix* elts, Matrix* B, Matrix* probs, Matrix* fixed, int size, bool nofix)
    : Thread(true), _elts(elts), _B(B), _probs(probs), _fixed(fixed), _size(size), _nofix(nofix) {
}

MultiSoftmaxWorker::~MultiSoftmaxWorker() {
    delete _elts;
    delete _probs;
    delete _fixed;
}

void* MultiSoftmaxWorker::run() {
    MultiSoftmaxCPU_T_logspace(*_elts, *_B, *_probs, *_fixed, _size, _nofix);
    return NULL;
}

/*
 * elts:  (numCases, numOut)
 * B:     vector of (N + 1, size + 1) -- backward lattice matrices, should be initially zero
 * fixed: (numCases, 1)
 * probs: (numCases, numOut)   (*out)
 *
 * NOTE: remember to write a version of this for transposed matrices.
 * It may end up being significantly faster, which is important if
 * I plan to use CPU for this.
 */
void MultiSoftmaxCPU_T_parallel(Matrix& elts, vector<Matrix*>& B, Matrix& probs, Matrix& fixed, int size, bool nofix) {
    int numCases = elts.getNumRows();
    int numWorkers = min(numCases, (int)B.size());
    probs.resize(elts);
    int casesPerWorker = DIVUP(numCases, B.size());
    numWorkers = min(numWorkers, DIVUP(numCases, casesPerWorker));
    vector<Thread*> workers;
    for (int i = 0; i < numWorkers; ++i) {
        Matrix* eltSlice = &elts.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
        Matrix* probSlice = &probs.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
        Matrix* fixedSlice = &fixed.sliceRows(i * casesPerWorker, min(elts.getNumRows(), (long int)(i + 1) * casesPerWorker));
        workers.push_back(new MultiSoftmaxWorker(eltSlice, B[i], probSlice, fixedSlice, size, nofix));
        workers[i]->start();
    }
    for (int i = 0; i < numWorkers; ++i) {
        workers[i]->join();
        delete workers[i];
    }
}
85
src/neuron.cu
Normal file
@@ -0,0 +1,85 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <neuron.cuh>
#include <util.cuh>

using namespace std;

Neuron& Neuron::makeNeuron(PyObject* neuronDict) {
    string type = pyDictGetString(neuronDict, "type");
    PyObject* neuronParamsDict = PyDict_GetItemString(neuronDict, "params");

    if (type == "relu") {
        return *new ReluNeuron();
    }

    if (type == "nrelu") {
        return *new NoisyReluNeuron();
    }

    if (type == "drelu") {
        return *new DoubleReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "softrelu") {
        return *new SoftReluNeuron();
    }

    if (type == "brelu") {
        return *new BoundedReluNeuron(pyDictGetFloat(neuronParamsDict, "a"));
    }

    if (type == "abs") {
        return *new AbsNeuron();
    }

    if (type == "logistic") {
        return *new LogisticNeuron();
    }

    if (type == "tanh") {
        return *new TanhNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "square") {
        return *new SquareNeuron();
    }

    if (type == "sqrt") {
        return *new SqrtNeuron();
    }

    if (type == "linear") {
        return *new LinearNeuron(pyDictGetFloat(neuronParamsDict, "a"), pyDictGetFloat(neuronParamsDict, "b"));
    }

    if (type == "ident") {
        return *new Neuron();
    }

    throw string("Unknown neuron type: ") + type;
}
242
src/pyconvnet.cu
Normal file
@@ -0,0 +1,242 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <Python.h>
#include <arrayobject.h>
#include <assert.h>
#include <helper_cuda.h>
#include <cublas.h>
#include <time.h>
#include <vector>

#include <matrix.h>
#include <queue.h>
#include <worker.cuh>
#include <util.cuh>
#include <cost.cuh>

#include <pyconvnet.cuh>
#include <convnet.cuh>

using namespace std;
static ConvNet* model = NULL;

static PyMethodDef _ConvNetMethods[] = {{ "initModel",          initModel,          METH_VARARGS },
                                        { "startBatch",         startBatch,         METH_VARARGS },
                                        { "finishBatch",        finishBatch,        METH_VARARGS },
                                        { "checkGradients",     checkGradients,     METH_VARARGS },
                                        { "startMultiviewTest", startMultiviewTest, METH_VARARGS },
                                        { "startFeatureWriter", startFeatureWriter, METH_VARARGS },
                                        { "startDataGrad",      startDataGrad,      METH_VARARGS },
                                        { "syncWithHost",       syncWithHost,       METH_VARARGS },
                                        { NULL, NULL }
};

#if defined(_WIN64) || defined(_WIN32)
extern "C" __declspec(dllexport) void initpyconvnet() {
    (void) Py_InitModule("pyconvnet", _ConvNetMethods);
    import_array();
}
#else
void INITNAME() {
    (void) Py_InitModule(QUOTEME(MODELNAME), _ConvNetMethods);
    import_array();
}
#endif

PyObject* initModel(PyObject *self, PyObject *args) {
    assert(model == NULL);

    PyDictObject* pyLayerParams;
    PyListObject* pyDeviceIDs, *pyDeviceCPUs;
    int pyMinibatchSize;
    int pyWeightUpdateFreq;

    if (!PyArg_ParseTuple(args, "O!O!O!ii",
                          &PyDict_Type, &pyLayerParams,
                          &PyList_Type, &pyDeviceIDs,
                          &PyList_Type, &pyDeviceCPUs,
                          &pyMinibatchSize,
                          &pyWeightUpdateFreq)) {
        return NULL;
    }
    intv& deviceIDs = *getIntV((PyObject*)pyDeviceIDs);
    vector<intv*>& deviceCPUs = *new vector<intv*>();
    for (int i = 0; i < PyList_GET_SIZE(pyDeviceCPUs); i++) {
        intv* v = getIntV(PyList_GetItem((PyObject*)pyDeviceCPUs, i));
        deviceCPUs.push_back(v);
    }
    model = new ConvNet((PyObject*)pyLayerParams,
                        deviceIDs,
                        deviceCPUs,
                        pyMinibatchSize,
                        pyWeightUpdateFreq);

    model->start();
    return Py_BuildValue("i", 0);
}

/*
 * Starts training/testing on the given batch (asynchronous -- returns immediately).
 */
PyObject* startBatch(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    double progress;
    int test = 0;
    if (!PyArg_ParseTuple(args, "O!d|i",
                          &PyList_Type, &data,
                          &progress,
                          &test)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);

    TrainingWorker* wr = new TrainingWorker(*model, *cpuData, progress, test);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

/*
 * Starts testing on the given batch (asynchronous -- returns immediately).
 */
PyObject* startMultiviewTest(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    int numViews;
    PyArrayObject* pyProbs = NULL;
    char* logregName = NULL;
    if (!PyArg_ParseTuple(args, "O!i|O!s",
                          &PyList_Type, &data,
                          &numViews,
                          &PyArray_Type, &pyProbs,
                          &logregName)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);
    MultiviewTestWorker* wr = pyProbs == NULL ? new MultiviewTestWorker(*model, *cpuData, numViews)
                                              : new MultiviewTestWorker(*model, *cpuData, numViews, *new Matrix(pyProbs), logregName);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

PyObject* startFeatureWriter(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    PyListObject* pyFtrs;
    PyListObject* pyLayerNames;
    if (!PyArg_ParseTuple(args, "O!O!O!",
                          &PyList_Type, &data,
                          &PyList_Type, &pyFtrs,
                          &PyList_Type, &pyLayerNames)) {
        return NULL;
    }
    stringv* layerNames = getStringV((PyObject*)pyLayerNames);
    CPUData* cpuData = new CPUData((PyObject*)data);
    MatrixV* ftrs = getMatrixV((PyObject*)pyFtrs);

    FeatureWorker* wr = new FeatureWorker(*model, *cpuData, *ftrs, *layerNames);
    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

PyObject* startDataGrad(PyObject *self, PyObject *args) {
//    assert(model != NULL);
//    PyListObject* data;
//    int dataLayerIdx, softmaxLayerIdx;
//    if (!PyArg_ParseTuple(args, "O!ii",
//                          &PyList_Type, &data,
//                          &dataLayerIdx, &softmaxLayerIdx)) {
//        return NULL;
//    }
//    CPUData* cpuData = new CPUData((PyObject*)data);
//    Matrix& ftrs = *mvec.back();
//    mvec.pop_back();
//
//    DataGradWorker* wr = new DataGradWorker(*model, *cpuData, ftrs, dataLayerIdx, softmaxLayerIdx);
//    model->getWorkerQueue().enqueue(wr);
    return Py_BuildValue("i", 0);
}

/*
 * Waits for the trainer to finish training on the batch given to startBatch.
 */
PyObject* finishBatch(PyObject *self, PyObject *args) {
    assert(model != NULL);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::BATCH_DONE);

    Cost& cost = res->getResults();
    PyObject* dict = PyDict_New();
    CostMap& costMap = cost.getCostMap();
    for (CostMap::const_iterator it = costMap.begin(); it != costMap.end(); ++it) {
        PyObject* v = PyList_New(0);
        for (vector<double>::const_iterator iv = it->second->begin(); iv != it->second->end(); ++iv) {
            PyObject* f = PyFloat_FromDouble(*iv);
            PyList_Append(v, f);
        }
        PyDict_SetItemString(dict, it->first.c_str(), v);
    }

    PyObject* retVal = Py_BuildValue("Ni", dict, cost.getNumCases());
    delete res; // Deletes cost too
    return retVal;
}
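/*
 * From Python, the intended call sequence is roughly as follows (a sketch; the
 * module name is "pyconvnet" in the Windows init above, and whatever MODELNAME
 * expands to elsewhere; the batch data layout is whatever CPUData accepts,
 * i.e. a list of numpy matrices):
 *
 *   import pyconvnet
 *   pyconvnet.initModel(layer_params, device_ids, device_cpus, mb_size, wu_freq)
 *   pyconvnet.startBatch(data, progress)        # asynchronous, returns at once
 *   costs, num_cases = pyconvnet.finishBatch()  # blocks until the batch is done
 */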

PyObject* checkGradients(PyObject *self, PyObject *args) {
    assert(model != NULL);
    PyListObject* data;
    if (!PyArg_ParseTuple(args, "O!",
                          &PyList_Type, &data)) {
        return NULL;
    }
    CPUData* cpuData = new CPUData((PyObject*)data);

    GradCheckWorker* wr = new GradCheckWorker(*model, *cpuData);
    model->getWorkerQueue().enqueue(wr);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::BATCH_DONE);
    delete res;
    return Py_BuildValue("i", 0);
}

/*
 * Copies weight matrices from GPU to system memory.
 */
PyObject* syncWithHost(PyObject *self, PyObject *args) {
    assert(model != NULL);
    SyncWorker* wr = new SyncWorker(*model);
    model->getWorkerQueue().enqueue(wr);
    WorkResult* res = model->getResultQueue().dequeue();
    assert(res != NULL);
    assert(res->getResultType() == WorkResult::SYNC_DONE);

    delete res;
    return Py_BuildValue("i", 0);
}
65
src/quantizer.cu
Normal file
@@ -0,0 +1,65 @@
#include <quantizer.cuh>

using namespace std;

/* ================
 * Quantizer
 * ================
 */

Quantizer& Quantizer::make(PyObject* lrsDict) {
    string type = pyDictGetString(lrsDict, "type");
    if (type == "default") {
        return *new Quantizer();
    } else if (type == "half") {
        return *new HalfQuantizer();
    }
    throw string("Unknown quantizer type ") + type;
}

Quantizer::Quantizer() : _numRows(0), _numCols(0), _trans(false) {
}

Quantizer::~Quantizer() {
}

void Quantizer::quantize(NVMatrix& src, NVMatrix& tgt) {
    _quantize(src, tgt);
    _quantized = &tgt;
    _numRows = src.getNumRows();
    _numCols = src.getNumCols();
    _trans = src.isTrans();
}

void Quantizer::dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
    _dequantize(tgt, scaleTarget, scaleOutput);
    tgt.setTrans(_trans);
    tgt.reshape(_numRows, _numCols);
}

void Quantizer::dequantize(NVMatrix& tgt) {
    dequantize(tgt, 0, 1);
}

void Quantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
    src.copy(tgt);
}

void Quantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
    tgt.add(*_quantized, scaleTarget, scaleOutput);
}

/* ================
 * HalfQuantizer
 * ================
 */
HalfQuantizer::HalfQuantizer() : Quantizer() {
}

void HalfQuantizer::_quantize(NVMatrix& src, NVMatrix& tgt) {
    convQuantizeHalf(src, tgt);
}

void HalfQuantizer::_dequantize(NVMatrix& tgt, float scaleTarget, float scaleOutput) {
    convDequantizeHalf(*_quantized, tgt, _numRows * _numCols, scaleTarget, scaleOutput);
}
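/*
 * Minimal usage sketch (hedged: src, sent and dst are hypothetical NVMatrix
 * objects, pyDict is a dict whose "type" entry is "half", and the blend
 * semantics dst := scaleTarget*dst + scaleOutput*decoded are an assumption
 * about convDequantizeHalf inferred from the default _dequantize above):
 *
 *   Quantizer& q = Quantizer::make(pyDict);
 *   q.quantize(src, sent);  // sent now holds the half-precision encoding
 *   q.dequantize(dst);      // same as dequantize(dst, 0, 1): dst := decoded
 */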
441
src/softmaxtree.cu
Normal file
@@ -0,0 +1,441 @@
#include <softmaxtree.cuh>

#include "layer.cuh"

using namespace std;

/*
 * This launches a series of blocks for every node at a given depth.
 * The "series" just spans the length of the weight vectors.
 *
 * The operation performed is (loosely):
 *
 * targets[d] := weights[d] + targets[d-1]
 *
 * Block size: (y, x) = (1, B_X)
 * Grid size:  (y, x) = (numNodesAtDepth, ceil(numFeatures/B_X))
 *
 * weights: (numNodes, numFeatures)
 * nodes:   numNodesAtDepth-length array of ushort2
 *          where x coordinate gives node idx and y coordinate gives parent idx
 * targets: (numNodes, numFeatures)
 */
template<int B_X, bool root>
__global__ void kSoftmaxTreeFwd(float* weights, ushort2* nodes, float* targets, const int numFeatures) {
    __shared__ ushort2 node; // node.x == node idx, node.y == parent node idx
    const int depthNodeIdx = blockIdx.y;
    const int featureOffset = blockIdx.x * B_X + threadIdx.x;
    if (threadIdx.x == 0) {
        node = nodes[depthNodeIdx];
    }
    __syncthreads();
    weights += featureOffset;
    targets += featureOffset;
    // No loops for now
    if (featureOffset < numFeatures) {
        if (root) {
            targets[node.x * numFeatures] = weights[numFeatures * node.x];
        } else {
            targets[node.x * numFeatures] = targets[node.y * numFeatures] + weights[numFeatures * node.x];
        }
    }
}
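/*
 * Net effect, as a sketch: for a root -> a -> b path in the tree, running this
 * kernel depth by depth (as makeWeights does below) leaves
 * targets[b] = weights[root] + weights[a] + weights[b], i.e. every node's
 * effective weight vector is the sum of the offsets along its root path.
 */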

/*
 * This launches a series of blocks for every node at a given height.
 * The "series" just spans the length of the weight vectors.
 *
 * The operation performed is (loosely):
 *
 * grads[h] := sum_d{grads[h-1]}
 *
 * Block size: (y, x) = (1, B_X)
 * Grid size:  (y, x) = (numNodesAtHeight, ceil(numFeatures/B_X))
 *
 * grads:        (numNodes, numFeatures)
 * nodes:        numNodesAtHeight-length array of ushort2
 *               where x coordinate gives node idx and y coordinate gives NUMBER OF CHILDREN
 *               ^ (note difference with kSoftmaxTreeFwd)
 * childrenPtrs: numNodesAtHeight-length array of pointers to children indices
 *
 * The idea is to start one of these grids at each height, in sequence, starting
 * from height = 1.
 *
 * The rows 0..numLabels-1 of grads must already have the correct softmax gradients
 * (these are the nodes at height = 0).
 */
template<int B_X>
__global__ void kSoftmaxTreeBwd(float* grads, ushort2* nodes, ushort** childrenPtrs, const int numFeatures) {
    __shared__ ushort2 node; // node.x == node idx, node.y == number of children
    __shared__ ushort* childrenPtr;
    __shared__ ushort children[B_X];
    const int heightNodeIdx = blockIdx.y;
    const int featureOffset = blockIdx.x * B_X + threadIdx.x;
    if (threadIdx.x == 0) {
        node = nodes[heightNodeIdx];
        childrenPtr = childrenPtrs[heightNodeIdx];
    }
    __syncthreads();

    grads += featureOffset;
    const int nodeIdx = node.x;
    const int numChildren = node.y;

    float nodeGrad = 0;
    for (int c = 0; c < numChildren; c += B_X) {
        if (c + threadIdx.x < numChildren) {
            children[threadIdx.x] = childrenPtr[c + threadIdx.x];
        }
        __syncthreads();
        if (featureOffset < numFeatures) {
            const int numChildrenLeft = min(B_X, numChildren - c);
            for (int cc = 0; cc < numChildrenLeft; ++cc) {
                const int childIdx = children[cc];
                //const int childIdx = childrenPtr[c + cc];
                nodeGrad += grads[childIdx * numFeatures];
            }
        }
        __syncthreads();
    }
    if (featureOffset < numFeatures) {
        grads[nodeIdx * numFeatures] = nodeGrad;
    }
}
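/*
 * Net effect, as a sketch: if node n has children c1..ck, one block row
 * computes grads[n] = grads[c1] + ... + grads[ck] per feature. Launched height
 * by height (see distributeGradients below), this accumulates into every
 * internal node the summed gradient of all leaves beneath it -- the transpose
 * of the forward pass, where each leaf's effective weights included every
 * ancestor's offsets.
 */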

/*
 * Block size: (y, x) = (1, B_X)
 * Grid size:  (y, x) = (1, numNodes)
 *
 * weights:     (numNodes, numFeatures)
 * weightsInc:  (numNodes, numFeatures)
 * weightsGrad: (numNodes, numFeatures)
 * nodeSizes:   numNodes-array whose ith element gives number of leaves under
 *              node with label i.
 *
 * TODO: why did I make nodeSizes ushort? int would probably be fine.
 */
template<int B_X>
__global__ void kSoftmaxTreeUpdateWeights(float* weights, float* weightsInc, float* weightsGrad,
                                          ushort* nodeSizes, const int numFeatures,
                                          float eps, const float mom, float wc) {
    __shared__ int nodeSize; // number of leaves under this node
    const int nodeIdx = blockIdx.x;
    if (threadIdx.x == 0) {
        nodeSize = nodeSizes[nodeIdx];
    }
    __syncthreads();
    weights += nodeIdx * numFeatures;
    weightsInc += nodeIdx * numFeatures;
    weightsGrad += nodeIdx * numFeatures;

    // TODO: make these shared?
//    eps *= sqrtf(nodeSize);
    wc /= nodeSize;
    eps /= nodeSize; // larger epsW at the leaves

    for (int f = threadIdx.x; f < numFeatures; f += B_X) {
        const float inc = mom * weightsInc[f] + eps * (weightsGrad[f] - wc * weights[f]);
        weightsInc[f] = inc;
        weights[f] += inc;
    }
}
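/*
 * Worked numbers for the scaling above (a sketch, assuming eps = 0.01 and
 * wc = 0.005): a leaf has nodeSize = 1 and updates with the full eps = 0.01,
 * while an internal node covering 4 leaves uses eps/4 = 0.0025 and
 * wc/4 = 0.00125 -- presumably to offset the fact that distributeGradients
 * sums 4 leaves' worth of gradient into that node.
 */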

/*
 * ==================
 * SoftmaxNode
 * ==================
 */
int SoftmaxNode::setDistances(std::map<int, SoftmaxNodeV*>& nodeHeights,
                              std::map<int, SoftmaxNodeV*>& nodeDepths) {
    _height = 0;
    for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
        _height = max(_height, (*it)->setDistances(nodeHeights, nodeDepths));
    }
    _height += _children.size() > 0;
    if (nodeHeights.count(_height) == 0) {
        nodeHeights[_height] = new SoftmaxNodeV();
    }
    if (nodeDepths.count(_depth) == 0) {
        nodeDepths[_depth] = new SoftmaxNodeV();
    }

    nodeHeights[_height]->push_back(this);
    nodeDepths[_depth]->push_back(this);
    return _height;
}

void SoftmaxNode::setNodeCounts(int &nodes, int& leaves) {
    nodes++;
    leaves += _children.size() == 0;
    for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
        (*it)->setNodeCounts(nodes, leaves);
    }
}

int SoftmaxNode::setSizes(ushort* nodeSizes) {
    _size = _children.size() == 0;
    for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
        _size += (*it)->setSizes(nodeSizes);
    }
    nodeSizes[_label] = _size;
    return _size;
}

SoftmaxNode::SoftmaxNode(SoftmaxNode* parent, int label)
    : _parent(parent), _label(label), _size(0), _height(0) {
    _depth = parent == NULL ? 0 : parent->getDepth() + 1;
}

SoftmaxNode::~SoftmaxNode() {
    for (SoftmaxNodeV::iterator it = _children.begin(); it != _children.end(); ++it) {
        delete *it;
    }
}

int SoftmaxNode::getDepth() const {
    return _depth;
}

int SoftmaxNode::getHeight() const {
    return _height;
}

int SoftmaxNode::getSize() const {
    return _size;
}

int SoftmaxNode::getLabel() const {
    return _label;
}

SoftmaxNode* SoftmaxNode::getParent() {
    return _parent;
}

SoftmaxNodeV& SoftmaxNode::getChildren() {
    return _children;
}

SoftmaxNode& SoftmaxNode::addChild(int label) {
    _children.push_back(new SoftmaxNode(this, label));
    return *_children.back();
}

/*
 * ==================
 * SoftmaxTree
 * ==================
 */
SoftmaxTree::SoftmaxTree(int rootLabel) {
    _root = new SoftmaxNode(NULL, rootLabel);
    _nodeSizes = NULL;
    _numNodes = 0;
    _numLeaves = 0;
}

SoftmaxTree::~SoftmaxTree() {
    checkCudaErrors(cudaFreeHost(_nodeSizes));

    for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
        int height = it->first;
        SoftmaxNodeV& nodes = *it->second;
        for (int n = 0; n < nodes.size(); n++) {
            checkCudaErrors(cudaFreeHost(_nodeChildMeta[height][n]));
        }
        checkCudaErrors(cudaFreeHost(_nodeChildMeta[height]));
        checkCudaErrors(cudaFreeHost(_nodeBwdMeta[height]));
        delete &nodes;
    }
    for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
        SoftmaxNodeV& nodes = *it->second;
        int depth = it->first;
        checkCudaErrors(cudaFreeHost(_nodeFwdMeta[depth]));
        delete &nodes;
    }

    delete _root;
}
void SoftmaxTree::setFwdMeta() {
    for (map<int, SoftmaxNodeV*>::iterator it = _nodeDepths.begin(); it != _nodeDepths.end(); ++it) {
        SoftmaxNodeV& nodes = *it->second;
        ushort2* meta;
        checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
        int depth = it->first;
        _nodeFwdMeta[depth] = meta;
        for (int n = 0; n < nodes.size(); n++) {
            meta[n].x = nodes[n]->getLabel();
            // Setting the root to have parent 0 is ok because the fwd kernel won't
            // query this anyway when root == true.
            meta[n].y = nodes[n]->getParent() == NULL ? 0 : nodes[n]->getParent()->getLabel();
        }
    }
}

void SoftmaxTree::setBwdMeta() {
    for (map<int, SoftmaxNodeV*>::iterator it = _nodeHeights.begin(); it != _nodeHeights.end(); ++it) {
        SoftmaxNodeV& nodes = *it->second;
        ushort2* meta;
        ushort** childMeta;
        checkCudaErrors(cudaHostAlloc(&meta, sizeof(ushort2) * nodes.size(), cudaHostAllocPortable));
        checkCudaErrors(cudaHostAlloc(&childMeta, sizeof(ushort*) * nodes.size(), cudaHostAllocPortable));
        int height = it->first;
        _nodeBwdMeta[height] = meta;
        _nodeChildMeta[height] = childMeta;
        for (int n = 0; n < nodes.size(); n++) {
            checkCudaErrors(cudaHostAlloc(&childMeta[n], sizeof(ushort) * nodes[n]->getChildren().size(), cudaHostAllocPortable));
            for (int c = 0; c < nodes[n]->getChildren().size(); c++) {
                childMeta[n][c] = nodes[n]->getChildren()[c]->getLabel();
            }
            meta[n].x = nodes[n]->getLabel();
            meta[n].y = nodes[n]->getChildren().size();
        }
    }
}

void SoftmaxTree::setDistances() {
    _nodeHeights.clear();
    _nodeDepths.clear();
    _root->setDistances(_nodeHeights, _nodeDepths);
}

void SoftmaxTree::setNodeCounts() {
    _numNodes = 0;
    _numLeaves = 0;
    _root->setNodeCounts(_numNodes, _numLeaves);
}

void SoftmaxTree::setNodeSizes() {
    assert(_numLeaves > 0);
    checkCudaErrors(cudaHostAlloc(&_nodeSizes, sizeof(ushort) * _numNodes, cudaHostAllocPortable));
    _root->setSizes(_nodeSizes);
}

void SoftmaxTree::finalize() {
    setDistances();
    setNodeCounts();
    setNodeSizes();
    setFwdMeta();
    setBwdMeta();
}

SoftmaxNode& SoftmaxTree::getRoot() {
    return *_root;
}

SoftmaxNodeV& SoftmaxTree::getNodesAtHeight(int height) {
    return *_nodeHeights[height];
}

SoftmaxNodeV& SoftmaxTree::getNodesAtDepth(int depth) {
    return *_nodeDepths[depth];
}

int SoftmaxTree::getHeight() const {
    return _root->getHeight();
}

/*
 * A tree with only a root is taken to have depth 0.
 */
int SoftmaxTree::getDepth() const {
    return _nodeDepths.size() - 1;
}

int SoftmaxTree::getNumLeaves() const {
    return _numLeaves;
}

int SoftmaxTree::getNumNodes() const {
    return _numNodes;
}

/*
 * offsets: (numNodes, numFeatures)
 * targets: (numNodes, numFeatures)
 */
void SoftmaxTree::makeWeights(NVMatrix& offsets, NVMatrix& targets) {
    preprocess(offsets);
    preprocess(targets);
    assert(offsets.getNumRows() == _numNodes);
    assert(targets.isSameDims(offsets));
    int numFeatures = offsets.getNumCols();
    dim3 threads = dim3(256); // 256 seems to work best on dummy binary tree
    dim3 blocks = dim3(DIVUP(numFeatures, 256), 1); // Only the root is at depth 0
    cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, true>, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(kSoftmaxTreeFwd<256, false>, cudaFuncCachePreferL1);
    kSoftmaxTreeFwd<256, true><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[0], targets.getDevData(), numFeatures);
    getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
    for (int d = 1; d <= getDepth(); d++) {
        blocks = dim3(DIVUP(numFeatures, 256), _nodeDepths[d]->size());
        kSoftmaxTreeFwd<256, false><<<blocks, threads>>>(offsets.getDevData(), _nodeFwdMeta[d], targets.getDevData(), numFeatures);
        getLastCudaError("kSoftmaxTreeFwd: kernel execution failed");
    }

    postprocess(offsets);
    postprocess(targets);
}

/*
 * grads: (numNodes, numFeatures)
 *
 * The idea is that grads contains gradients for the leaves
 * (i.e. the first numLabels rows), so this routine will
 * distribute them up the tree.
 */
void SoftmaxTree::distributeGradients(NVMatrix& grads) {
    preprocess(grads);
    assert(grads.getNumRows() == _numNodes);
    int numFeatures = grads.getNumCols();
    // The leaves (nodes at height = 0) already have gradients computed.
    // So start at the nodes at height = 1.
    dim3 threads = dim3(512); // this block size works best :/
    cudaFuncSetCacheConfig(kSoftmaxTreeBwd<512>, cudaFuncCachePreferL1);
    for (int h = 1; h <= getHeight(); ++h) {
        dim3 blocks = dim3(DIVUP(numFeatures, 512), _nodeHeights[h]->size());
        kSoftmaxTreeBwd<512><<<blocks, threads>>>(grads.getDevData(), _nodeBwdMeta[h], _nodeChildMeta[h], numFeatures);
        getLastCudaError("kSoftmaxTreeBwd: kernel execution failed");
    }
    postprocess(grads);
}

/*
 * inc := mom * inc - wc * epsW * weight + epsW * grad
 * weight := weight + inc
 *
 * weights: (numNodes, numFeatures)
 * incs:    (numNodes, numFeatures)
 * grads:   (numNodes, numFeatures)
 */
void SoftmaxTree::updateWeights(NVMatrix& weights, NVMatrix& incs, NVMatrix& grads, float epsWBase, float mom, float wcBase) {
    preprocess(weights);
    preprocess(incs);
    preprocess(grads);

    assert(grads.getNumRows() == _numNodes);
    assert(grads.isSameDims(incs));
    assert(grads.isSameDims(weights));
    int numFeatures = grads.getNumCols();
    dim3 threads = dim3(512);
    dim3 blocks = dim3(_numNodes);
    cudaFuncSetCacheConfig(kSoftmaxTreeUpdateWeights<512>, cudaFuncCachePreferL1);
    kSoftmaxTreeUpdateWeights<512><<<blocks, threads>>>(weights.getDevData(), incs.getDevData(), grads.getDevData(),
                                                        _nodeSizes, numFeatures, epsWBase, mom, wcBase);
    getLastCudaError("kSoftmaxTreeUpdateWeights: kernel execution failed");
    postprocess(weights);
    postprocess(incs);
    postprocess(grads);
}

void SoftmaxTree::preprocess(NVMatrix& inp) {
    inp.transpose();
    assert(!inp.isTrans());
    assert(inp.isContiguous());
}

void SoftmaxTree::postprocess(NVMatrix& inp) {
    inp.transpose();
}
378
src/test.cu
Normal file
@@ -0,0 +1,378 @@
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <set>
#include <test.cuh>
#include <layer_kernels.cuh>
#include <multisoftmax.h>
#include <cpuCNN.cuh>

static StopWatchInterface *timer = NULL;
using namespace std;

void init_tests(int boardNum) {
    cudaSetDevice(boardNum > -1 ? boardNum : 0);
//    cublasInit();
    NVMatrix::initCublas();
    NVMatrix::initRandom(7);
    sdkCreateTimer(&timer);
}

void compareResults(Matrix& cpu, NVMatrix& gpu, const char* matrixName) {
    Matrix gpuOnCPU(cpu);
    gpu.copyToHost(gpuOnCPU);
    gpuOnCPU.subtract(cpu);
    gpuOnCPU.apply(Matrix::ABS);
    printf("Max diff between CPU/GPU matrices %s: %.6f\n", matrixName, gpuOnCPU.max());
}

void test_blattice() {
    printf("===============================\n");
    printf("test_blattice\n");
    printf("===============================\n");

    int numCases = 2;
    int numOut = 32;
    int setSize = 3;

    cout << "numCases: " << numCases << endl;
    cout << "numOut: " << numOut << endl;
    cout << "setSize: " << setSize << endl;
    NVMatrix nvEnergies(numCases, numOut);
    Matrix energies(numCases, numOut);
    Matrix bLattice(numOut, numCases * setSize);

    nvEnergies.randomizeUniform();
    nvEnergies.copyToHost(energies);
    //energies.randomizeUniform();
    bLattice.apply(Matrix::ZERO); // for now

    Matrix& enMax = energies.max(1);
    energies.addVector(enMax, -1);

    nvEnergies.copyFromHost(energies);
    NVMatrix nvBLattice(bLattice, true);

    sdkResetTimer(&timer);
    sdkStartTimer(&timer);

    MSMBackward(nvEnergies, nvBLattice, setSize);

    cudaThreadSynchronize();
    sdkStopTimer(&timer);

    printf("Energies: \n");
    nvEnergies.print(10, 5);

    printf("GPU (partial) result:\n");
    nvBLattice.print(0, 5, 0, 5);
    printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}

//void test_multiSoftmaxCPU() {
//    printf("===============================\n");
//    printf("test_multiSoftmaxCPU\n");
//    printf("===============================\n");
//
//    int numCases = 2;
//    int numOut = 5;
//    int setSize = 3;
//
////    int numCases = 128;
////    int numOut = 1000;
////    int setSize = 5;
//
//    cout << "numCases: " << numCases << endl;
//    cout << "numOut: " << numOut << endl;
//    cout << "setSize: " << setSize << endl;
//
//    Matrix energies(numCases, numOut);
//    Matrix B(numOut + 1, setSize + 1);
//    Matrix probs(energies);
//    energies.randomizeUniform();
//    probs.apply(Matrix::ZERO); // for now
//
//    Matrix& enMax = energies.max(1);
//    energies.addVector(enMax, -1);
//    B.apply(Matrix::ZERO);
//
//    sdkResetTimer(&timer);
//    sdkStartTimer(&timer);
//
//    MultiSoftmaxCPU_T(energies, B, probs, setSize, -1);
//
//    cudaThreadSynchronize();
//    sdkStopTimer(&timer);
//
//    printf("Energies: \n");
//    energies.print(10, 5);
//
//    printf("CPU (partial) result:\n");
//    probs.print(0, 5, 0, 5);
//    printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
//}

void test_multiSoftmaxCPU_parallel() {
    printf("===============================\n");
    printf("test_multiSoftmaxCPU_parallel\n");
    printf("===============================\n");

    int workers = 8;

    int numCases = 2;
    int numOut = 5;
    int setSize = 2;

//    int numCases = 128;
//    int numOut = 1000;
//    int setSize = 5;

    cout << "workers: " << workers << endl;
    cout << "numCases: " << numCases << endl;
    cout << "numOut: " << numOut << endl;
    cout << "setSize: " << setSize << endl;

    NVMatrix nvEnergies(numCases, numOut);
    Matrix energies(numCases, numOut);
    vector<Matrix*> B;
    Matrix probs(energies);
    Matrix fixed(numCases, 1);
    nvEnergies.randomizeUniform();
    nvEnergies.copyToHost(energies);
    //energies.randomizeUniform();
    probs.apply(Matrix::ZERO); // for now

    Matrix& enMax = energies.max(1);
    energies.addVector(enMax, -1);

    fixed.apply(Matrix::ONE);
    fixed.scale(2);

    for (int i = 0; i < workers; i++) {
        B.push_back(new Matrix(numOut + 1, setSize + 1));
        B[i]->apply(Matrix::ONE);
        B[i]->scale(-INF);
    }

    sdkResetTimer(&timer);
    sdkStartTimer(&timer);

    MultiSoftmaxCPU_T_parallel(energies, B, probs, fixed, setSize, true);

    cudaThreadSynchronize();
    sdkStopTimer(&timer);

    printf("Energies: \n");
    energies.print(10, 10);

    printf("CPU (partial) result:\n");
    probs.print(0, 5, 0, 10);
    printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
}

SoftmaxTree* makeDummyTree(int depth) {
    int numNodes = (1 << (depth + 1)) - 1;
    int numLeaves = (numNodes + 1) / 2;

    int idx = numNodes - 1;
    SoftmaxTree* tree = new SoftmaxTree(idx--);
    vector<SoftmaxNode*> prevLevel;

    prevLevel.push_back(&tree->getRoot());
    while (idx >= 0) {
        int sz = prevLevel.size();
        for (int i = 0; i < sz; i++) {
            SoftmaxNode& node = *prevLevel[0];
            SoftmaxNode& child1 = node.addChild(idx--);
            SoftmaxNode& child2 = node.addChild(idx--);
            prevLevel.push_back(&child1);
            prevLevel.push_back(&child2);
            prevLevel.erase(prevLevel.begin());
        }
    }
    tree->finalize();
    assert(tree->getNumLeaves() == numLeaves);
    assert(tree->getNumNodes() == numNodes);
    return tree;
}
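/*
 * Sketch of what makeDummyTree builds: for depth = 2, numNodes = 7 and
 * numLeaves = 4; labels are assigned top-down from numNodes - 1, so the root
 * gets label 6, the next level 5 and 4, and the leaves 3, 2, 1, 0. Each level
 * is fully expanded before the next because parents are popped from the front
 * of prevLevel as their two children are appended.
 */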
|
||||
|
||||
void test_sftree_fwd() {
|
||||
printf("===============================\n");
|
||||
printf("test_sftree_fwd\n");
|
||||
printf("===============================\n");
|
||||
|
||||
int numFeatures = 6*6*128;
|
||||
int depth = 10;
|
||||
SoftmaxTree* tree = makeDummyTree(depth);
|
||||
cout << "numFeatures: " << numFeatures << endl;
|
||||
cout << "depth: " << depth << endl;
|
||||
cout << "numNodes: " << tree->getNumNodes() << endl;
|
||||
cout << "numLabels: " << tree->getNumLeaves() << endl;
|
||||
|
||||
Matrix weights(tree->getNumNodes(), numFeatures);
|
||||
Matrix targets(tree->getNumNodes(), numFeatures);
|
||||
NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
|
||||
NVMatrix nvTargets(tree->getNumNodes(), numFeatures);
|
||||
|
||||
weights.randomizeUniform();
|
||||
|
||||
nvWeights.copyFromHost(weights);
|
||||
|
||||
sdkResetTimer(&timer);
|
||||
sdkStartTimer(&timer);
|
||||
|
||||
cpuSoftmaxTreeFwd(weights.getData(), targets.getData(), numFeatures, *tree);
|
||||
|
||||
sdkStopTimer(&timer);
|
||||
printf("CPU (partial) result:\n");
|
||||
targets.print(0, 7, 0, 5);
|
||||
printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));
|
||||
|
||||
sdkResetTimer(&timer);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
nvWeights.transpose();
|
||||
nvTargets.transpose();
|
||||
sdkStartTimer(&timer);
|
||||
|
||||
tree->makeWeights(nvWeights, nvTargets);
|
||||
cudaDeviceSynchronize();
|
||||
sdkStopTimer(&timer);
|
||||
|
||||
nvWeights.transpose();
|
||||
nvTargets.transpose();
|
||||
printf("GPU (partial) result:\n");
|
||||
nvTargets.print(0, 7, 0, 5);
|
||||
printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
|
||||
compareResults(targets, nvTargets, "targets");
|
||||
}

void test_sftree_bwd() {
    printf("===============================\n");
    printf("test_sftree_bwd\n");
    printf("===============================\n");

    int numFeatures = 6*6*128;
    int depth = 10;
    SoftmaxTree* tree = makeDummyTree(depth);
    cout << "numFeatures: " << numFeatures << endl;
    cout << "depth: " << depth << endl;
    cout << "numNodes: " << tree->getNumNodes() << endl;
    cout << "numLabels: " << tree->getNumLeaves() << endl;

    Matrix grads(tree->getNumNodes(), numFeatures);
    NVMatrix nvGrads(tree->getNumNodes(), numFeatures);

    grads.randomizeUniform();

    nvGrads.copyFromHost(grads);

    sdkResetTimer(&timer);
    sdkStartTimer(&timer);

    cpuSoftmaxTreeBwd(grads.getData(), numFeatures, *tree);

    sdkStopTimer(&timer);
    printf("CPU (partial) result:\n");
    grads.print(0, 7, 0, 5);
    printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));

    sdkResetTimer(&timer);
    cudaDeviceSynchronize();

    nvGrads.transpose();
    sdkStartTimer(&timer);

    tree->distributeGradients(nvGrads);
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);

    nvGrads.transpose();
    printf("GPU (partial) result:\n");
    nvGrads.print(0, 7, 0, 5);
    printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
    compareResults(grads, nvGrads, "grads");
}

void test_sftree_update() {
    printf("===============================\n");
    printf("test_sftree_update\n");
    printf("===============================\n");

    float eps = 0.001, wc = 0.005, mom = 0.9;
    int numFeatures = 6*6*128;
    int depth = 10;
    SoftmaxTree* tree = makeDummyTree(depth);
    cout << "numFeatures: " << numFeatures << endl;
    cout << "depth: " << depth << endl;
    cout << "numNodes: " << tree->getNumNodes() << endl;
    cout << "numLabels: " << tree->getNumLeaves() << endl;

    Matrix grads(tree->getNumNodes(), numFeatures);
    Matrix weights(tree->getNumNodes(), numFeatures);
    Matrix incs(tree->getNumNodes(), numFeatures);
    NVMatrix nvGrads(tree->getNumNodes(), numFeatures);
    NVMatrix nvWeights(tree->getNumNodes(), numFeatures);
    NVMatrix nvIncs(tree->getNumNodes(), numFeatures);

    grads.randomizeUniform();
    weights.randomizeUniform();
    incs.randomizeUniform();

    nvGrads.copyFromHost(grads);
    nvWeights.copyFromHost(weights);
    nvIncs.copyFromHost(incs);

    sdkResetTimer(&timer);
    sdkStartTimer(&timer);

    cpuSoftmaxTreeUpdateWeights(weights.getData(), incs.getData(), grads.getData(), numFeatures, eps, mom, wc, *tree);

    sdkStopTimer(&timer);
    printf("CPU (partial) result:\n");
    weights.print(0, 7, 0, 5);
    printf("CPU time: %.6f msec\n", sdkGetTimerValue(&timer));

    sdkResetTimer(&timer);
    cudaDeviceSynchronize();

    nvGrads.transpose();
    nvWeights.transpose();
    nvIncs.transpose();
    sdkStartTimer(&timer);

    tree->updateWeights(nvWeights, nvIncs, nvGrads, eps, mom, wc);
    cudaDeviceSynchronize();
    sdkStopTimer(&timer);

    nvGrads.transpose();
    nvWeights.transpose();
    nvIncs.transpose();
    printf("GPU (partial) result:\n");
    nvWeights.print(0, 7, 0, 5);
    printf("GPU time: %.6f msec\n", sdkGetTimerValue(&timer));
    compareResults(weights, nvWeights, "weights");
    compareResults(incs, nvIncs, "incs");
}
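
cpuSoftmaxTreeUpdateWeights, the CPU reference being checked here, is not in this hunk, but its (eps, mom, wc) signature matches the momentum-with-weight-decay arithmetic in Weights::update() later in this commit: inc = mom * inc + eps * (grad - wc * w), then w += inc. A scalar sketch of that step, offered as an assumption about the per-node update:

#include <cstdio>

// Hypothetical scalar form of the (eps, mom, wc) update the test exercises.
void momentumUpdate(float* w, float* inc, const float* grad, int n,
                    float eps, float mom, float wc) {
    for (int i = 0; i < n; i++) {
        inc[i] = mom * inc[i] + eps * (grad[i] - wc * w[i]);
        w[i] += inc[i];
    }
}

int main() {
    float w[2] = {0.5f, -0.25f}, inc[2] = {0, 0}, grad[2] = {0.1f, 0.2f};
    momentumUpdate(w, inc, grad, 2, 0.001f, 0.9f, 0.005f);
    printf("w = %f %f\n", w[0], w[1]);
    return 0;
}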

int main(int argc, char** argv) {

    int boardNum = get_board_lock();
    if (boardNum == GPU_LOCK_NO_BOARD) {
        printf("No free GPU boards!\n");
        exit(EXIT_FAILURE);
    } else if (boardNum == GPU_LOCK_NO_SCRIPT) {
        printf("Running on default board.\n");
    } else {
        printf("Running on board %d\n", boardNum);
    }
    init_tests(boardNum);

    // test_blattice();
    // test_multiSoftmaxCPU();
    // test_multiSoftmaxCPU_parallel();
    // test_sftree_fwd();
    // test_sftree_bwd();
    // test_mdiag();
    // test_mdiagGrad();
    return 0;
}
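
get_board_lock(), GPU_LOCK_NO_BOARD/GPU_LOCK_NO_SCRIPT, and init_tests() come from a GPU board-locking harness that is not part of this hunk. Purely as a hypothetical stand-in for what they provide (claiming a device and making it current), a device pick with an environment-variable fallback might look like this; the CUDADEVICE variable name is illustrative:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    int dev = 0;
    if (const char* env = getenv("CUDADEVICE")) {  // hypothetical override
        dev = atoi(env);
    }
    int count = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        printf("No free GPU boards!\n");
        return EXIT_FAILURE;
    }
    dev = dev < count ? dev : 0;
    cudaSetDevice(dev);
    printf("Running on board %d\n", dev);
    return 0;
}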
124
src/util.cu
Normal file

@@ -0,0 +1,124 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <util.cuh>

using namespace std;

stringv* getStringV(PyObject* pyList) {
    if (pyList == NULL) {
        return NULL;
    }
    stringv* vec = new stringv();
    for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
        vec->push_back(string(PyString_AS_STRING(PyList_GET_ITEM(pyList, i))));
    }
    return vec;
}

floatv* getFloatV(PyObject* pyList) {
    if (pyList == NULL) {
        return NULL;
    }
    floatv* vec = new floatv();
    for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
        vec->push_back(PyFloat_AS_DOUBLE(PyList_GET_ITEM(pyList, i)));
    }
    return vec;
}

intv* getIntV(PyObject* pyList) {
    if (pyList == NULL) {
        return NULL;
    }
    intv* vec = new intv();
    for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
        vec->push_back(PyInt_AS_LONG(PyList_GET_ITEM(pyList, i)));
    }
    return vec;
}

int* getIntA(PyObject* pyList) {
    if (pyList == NULL) {
        return NULL;
    }
    int* arr = new int[PyList_GET_SIZE(pyList)];
    for (int i = 0; i < PyList_GET_SIZE(pyList); i++) {
        arr[i] = PyInt_AS_LONG(PyList_GET_ITEM(pyList, i));
    }
    return arr;
}

MatrixV* getMatrixV(PyObject* pyList) {
    return getMatrixV(pyList, PyList_GET_SIZE(pyList));
}

MatrixV* getMatrixV(PyObject* pyList, int len) {
    if (pyList == NULL) {
        return NULL;
    }
    MatrixV* vec = new MatrixV();
    for (int i = 0; i < len; i++) {
        vec->push_back(new Matrix((PyArrayObject*)PyList_GET_ITEM(pyList, i)));
    }
    return vec;
}

int pyDictGetInt(PyObject* dict, const char* key) {
    return PyInt_AS_LONG(PyDict_GetItemString(dict, key));
}

intv* pyDictGetIntV(PyObject* dict, const char* key) {
    return getIntV(PyDict_GetItemString(dict, key));
}

int* pyDictGetIntA(PyObject* dict, const char* key) {
    return getIntA(PyDict_GetItemString(dict, key));
}

string pyDictGetString(PyObject* dict, const char* key) {
    return string(PyString_AS_STRING(PyDict_GetItemString(dict, key)));
}

float pyDictGetFloat(PyObject* dict, const char* key) {
    return PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, key));
}

floatv* pyDictGetFloatV(PyObject* dict, const char* key) {
    return getFloatV(PyDict_GetItemString(dict, key));
}

Matrix* pyDictGetMatrix(PyObject* dict, const char* key) {
    return new Matrix((PyArrayObject*)PyDict_GetItemString(dict, key));
}

MatrixV* pyDictGetMatrixV(PyObject* dict, const char* key) {
    return getMatrixV(PyDict_GetItemString(dict, key));
}

stringv* pyDictGetStringV(PyObject* dict, const char* key) {
    return getStringV(PyDict_GetItemString(dict, key));
}
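
These helpers wrap the Python 2 C-API (hence PyString_*/PyInt_*); layer parameters arrive as a Python dict and are unpacked into C types. A minimal self-contained usage sketch with the same raw calls (the key names are illustrative, not from this commit):

#include <Python.h>
#include <cstdio>

int main() {
    Py_Initialize();
    PyObject* dict = PyDict_New();
    PyObject* v;
    v = PyString_FromString("fc");  PyDict_SetItemString(dict, "type", v);    Py_DECREF(v);
    v = PyInt_FromLong(4096);       PyDict_SetItemString(dict, "outputs", v); Py_DECREF(v);
    v = PyFloat_FromDouble(0.01);   PyDict_SetItemString(dict, "epsW", v);    Py_DECREF(v);

    // The same raw calls pyDictGetString/pyDictGetInt/pyDictGetFloat wrap;
    // PyDict_GetItemString returns a borrowed reference.
    const char* type = PyString_AS_STRING(PyDict_GetItemString(dict, "type"));
    long outputs = PyInt_AS_LONG(PyDict_GetItemString(dict, "outputs"));
    double epsW = PyFloat_AS_DOUBLE(PyDict_GetItemString(dict, "epsW"));
    printf("type=%s outputs=%ld epsW=%g\n", type, outputs, epsW);

    Py_DECREF(dict);
    Py_Finalize();
    return 0;
}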
378
src/weights.cu
Normal file

@@ -0,0 +1,378 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <algorithm>
#include <weights.cuh>
#include <softmaxtree.cuh>
#include <lr.cuh>
#include "worker.cuh"

NVMatrix& Weights::operator*() const {
    return getW();
}

Weights::Weights(Weights& srcWeights, LearningRateSchedule& lrs)
    : _srcWeights(&srcWeights), _lrs(&lrs), _wc(0), _wball(0), _onGPU(false), _numUpdates(0),
      _weights(NULL), _weightsInc(NULL), _weightsGrad(NULL), _cleanup(false) {
    _hWeights = &srcWeights.getCPUW();
    _hWeightsInc = &srcWeights.getCPUWInc();
    _mom = srcWeights.getMom();
    _useGrad = srcWeights.isUseGrad();
    _superEps = srcWeights.getSuperEps();
}

Weights::Weights(Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wc,
                 float wball, float mom, float superEps, bool useGrad, bool cleanup)
    : _srcWeights(NULL), _hWeights(&hWeights), _hWeightsInc(&hWeightsInc), _numUpdates(0),
      _lrs(&lrs), _wc(wc), _wball(wball), _mom(mom), _useGrad(useGrad), _superEps(superEps),
      _onGPU(false), _weights(NULL), _weightsInc(NULL), _weightsGrad(NULL), _cleanup(cleanup) {
    assert(_superEps <= 0 || _useGrad); // superWeights ==> useGrad
}

Weights::~Weights() {
    delete _lrs;
    if (_cleanup) {
        delete _hWeights;
        delete _hWeightsInc;
        if (_srcWeights == NULL) {
            delete _weights;
            delete _weightsInc;
            delete _weightsGrad;
        }
    }
}

NVMatrix& Weights::getW() const {
    assert(_onGPU);
    return *_weights;
}

NVMatrix& Weights::getInc() const {
    assert(_onGPU);
    return *_weightsInc;
}

NVMatrix& Weights::getGrad() const {
    assert(_onGPU);
    return _useGrad ? *_weightsGrad : *_weightsInc;
}

Matrix& Weights::getCPUW() const {
    return *_hWeights;
}

Matrix& Weights::getCPUWInc() const {
    return *_hWeightsInc;
}

int Weights::getNumRows() const {
    return _hWeights->getNumRows();
}

int Weights::getNumCols() const {
    return _hWeights->getNumCols();
}

void Weights::copyToCPU() {
    if (_srcWeights == NULL) {
        assert(_onGPU);
        _weights->copyToHost(*_hWeights);
        _weightsInc->copyToHost(*_hWeightsInc);
    }
}

// This function is assumed to be called in the order in which the layers
// were defined.
void Weights::copyToGPU() {
    assert(!_onGPU);
    if (_srcWeights == NULL) {
        _weights = _weights == NULL ? new NVMatrix() : _weights;
        _weightsInc = _weightsInc == NULL ? new NVMatrix() : _weightsInc;
        _weights->copyFromHost(*_hWeights, true);
        _weightsInc->copyFromHost(*_hWeightsInc, true);
        _weightsGrad = _useGrad ? (_weightsGrad == NULL ? new NVMatrix(*_weights) : _weightsGrad) : NULL;
        _weightsGradAvg = _superEps > 0 ? new NVMatrix() : NULL;
        _weightsGrad2Avg = _superEps > 0 ? new NVMatrix() : NULL;
    } else {
        _weights = _srcWeights->_weights;
        _weightsInc = _srcWeights->_weightsInc;
        _weightsGrad = _srcWeights->_weightsGrad;
    }
    _onGPU = true;
}
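
Note the sharing scheme in copyToGPU(): only an owner (_srcWeights == NULL) allocates device matrices; a Weights built from another Weights just copies the pointers, so layers that share a weight matrix all read and write one set of device parameters. A tiny hypothetical sketch of that owner/view pattern (new float[] stands in for cudaMalloc):

#include <cassert>
#include <cstddef>

struct SharedBuf {
    float* data = nullptr;
    SharedBuf* src = nullptr;              // NULL ==> owner

    void toDevice(size_t n) {
        if (src == nullptr) {
            if (data == nullptr) data = new float[n];  // stand-in for cudaMalloc
        } else {
            data = src->data;              // alias the owner's storage
        }
    }
};

int main() {
    SharedBuf owner, view;
    view.src = &owner;
    owner.toDevice(16);
    view.toDevice(16);
    assert(view.data == owner.data);       // shared layers see the same weights
    return 0;
}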

#define SUPERMOM_THREADS 256
#define SUPERMOM_BLOCKS_MAX 4096

/*
 * V = eps * g / (G2 - G^2 + superEps)^.5 + mom * V
 */
__global__ void superMomUpdate(float* V, float* g, float* G, float* G2,
                               const float eps, const float mom, const float superEps, const int numElements) {
    const int tidx = blockIdx.x * SUPERMOM_THREADS + threadIdx.x;

    for (int t = tidx; t < numElements; t += gridDim.x * SUPERMOM_THREADS) {
        // Matches the update documented above: the original had a hard-coded
        // 0.9 in place of mom and divided by sqrt(G2 - G) rather than
        // sqrt(G2 - G^2).
        V[t] = mom * V[t] + eps * __fdividef(g[t], sqrtf(G2[t] - G[t] * G[t] + superEps));
    }
}
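
The kernel strides by gridDim.x * SUPERMOM_THREADS, so a grid capped at SUPERMOM_BLOCKS_MAX blocks still covers a matrix of any size. The same grid-stride pattern in isolation, on a generic scale kernel (names hypothetical):

#include <algorithm>
#include <cuda_runtime.h>

#define NTHREADS 256
#define MAX_BLOCKS 4096
#define DIVUP(a, b) (((a) + (b) - 1) / (b))

// Grid-stride loop: each thread handles elements tidx, tidx + gridSize,
// tidx + 2*gridSize, ... so a capped grid still touches every element.
__global__ void scaleKernel(float* x, float a, int n) {
    const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int t = tidx; t < n; t += gridDim.x * blockDim.x) {
        x[t] *= a;
    }
}

int main() {
    const int n = 1 << 20;
    float* d;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemset(d, 0, n * sizeof(float));
    int blocks = std::min(DIVUP(n, NTHREADS), MAX_BLOCKS); // same capping as below
    scaleKernel<<<blocks, NTHREADS>>>(d, 2.0f, n);
    cudaDeviceSynchronize();
    cudaFree(d);
    return 0;
}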

// When _useGrad is false, weightsInc is assumed to contain the
// entire, properly scaled weight increment.
// Otherwise, scale your gradient by 1 / numCases only;
// the scaling by epsW will be done in this routine.
void Weights::update(float progress) {
    // Only the true owner of the weights performs updates
    if (_srcWeights == NULL && _lrs->getBaseRate() > 0) {
        assert(_onGPU);
        if (_superEps <= 0) {
            if (_useGrad) {
                _weightsInc->add(*_weightsGrad, _mom, _lrs->getRate(progress));
            }
        } else {
            if (!_weightsGradAvg->isSameDims(*_weightsGrad)) {
                _weightsGradAvg->resize(*_weightsGrad);
                _weightsGrad2Avg->resize(*_weightsGrad);
                _weightsGradAvg->apply(NVMatrixOps::Zero());
                _weightsGrad2Avg->apply(NVMatrixOps::Zero());
            }
            _weightsGradAvg->add(*_weightsGrad, _mom, 1 - _mom);
            _weightsGrad2Avg->applyBinary(Grad2AvgOperator(_mom), *_weightsGrad);
            // Geoff version

            // Make sure all matrices are contiguous
            assert(_weightsGrad->isContiguous());
            assert(_weightsGradAvg->isContiguous());
            assert(_weightsGrad2Avg->isContiguous());
            assert(_weightsInc->isContiguous());
            // Make sure they all have the same transposedness
            assert(_weightsGrad->isTrans() == _weightsGradAvg->isTrans());
            assert(_weightsGradAvg->isTrans() == _weightsGrad2Avg->isTrans());
            assert(_weightsGrad2Avg->isTrans() == _weightsInc->isTrans());
            // Make sure they all have the same sizes
            assert(_weightsGrad->isSameDims(*_weightsGradAvg));
            assert(_weightsGradAvg->isSameDims(*_weightsGrad2Avg));
            assert(_weightsGrad2Avg->isSameDims(*_weightsInc));

            int numElements = _weights->getNumElements();
            dim3 blocks(std::min(DIVUP(numElements, SUPERMOM_THREADS), SUPERMOM_BLOCKS_MAX));
            dim3 threads(SUPERMOM_THREADS);
            //float super = _superEps + 1000000*_weightsGrad2Avg->sum() / numElements;
            //printf("super: %f\n", super);
            superMomUpdate<<<blocks, threads>>>(_weightsInc->getDevData(), _weightsGrad->getDevData(),
                                                _weightsGradAvg->getDevData(), _weightsGrad2Avg->getDevData(),
                                                _lrs->getRate(progress), _mom, _superEps, numElements);
            getLastCudaError("superMomUpdate: Kernel execution failed");
            //_weightsInc->print(4,4);
            //_weightsGrad2Avg->print(5,5);exit(0);
            // Ilya version
        }
        if (_wc > 0) {
            _weightsInc->add(*_weights, -_wc * _lrs->getRate(progress));
        }
        _weights->add(*_weightsInc);
        _numUpdates = 0;
    }
}
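
Taken together, the two running averages plus the kernel amount to an RMSProp-flavored step: the increment is a momentum average of gradients normalized by the variance estimate G2 - G^2 (plus superEps for numerical stability). A scalar host-side restatement of the superEps > 0 branch, under the assumption that Grad2AvgOperator keeps _weightsGrad2Avg as an exponential average of g*g:

#include <cmath>
#include <cstdio>

// Scalar restatement of the branch above: gAvg/g2Avg are exponential
// moving averages of g and g*g; v is the running increment.
void superMomStep(float* v, const float* g, float* gAvg, float* g2Avg,
                  int n, float eps, float mom, float superEps) {
    for (int i = 0; i < n; i++) {
        gAvg[i]  = mom * gAvg[i]  + (1 - mom) * g[i];        // G
        g2Avg[i] = mom * g2Avg[i] + (1 - mom) * g[i] * g[i]; // G2
        float var = g2Avg[i] - gAvg[i] * gAvg[i];            // G2 - G^2
        v[i] = mom * v[i] + eps * g[i] / sqrtf(var + superEps);
    }
}

int main() {
    float v[1] = {0}, g[1] = {0.3f}, gAvg[1] = {0}, g2Avg[1] = {0};
    superMomStep(v, g, gAvg, g2Avg, 1, 0.001f, 0.9f, 1e-4f);
    printf("v = %g\n", v[0]);
    return 0;
}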

int Weights::incNumUpdates() {
    if (_srcWeights != NULL) {
        return _srcWeights->incNumUpdates();
    }
    return _numUpdates++;
}

// Returns the number of times a gradient has been computed for this
// weight matrix during the current pass (the interval between two calls of update())
// through the net. This number will only be greater than 1 if this weight matrix
// is *shared* by multiple layers in the net.
int Weights::getNumUpdates() const {
    if (_srcWeights != NULL) {
        return _srcWeights->getNumUpdates();
    }
    return _numUpdates;
}

float Weights::getEps(float progress) const {
    return _lrs->getRate(progress);
}

float Weights::getMom() const {
    return _mom;
}

float Weights::getWC() const {
    return _wc;
}

float Weights::getWBall() const {
    return _wball;
}

bool Weights::isUseGrad() const { // is good grammar
    return _useGrad;
}

bool Weights::isOwner() const {
    return _srcWeights == NULL;
}

float Weights::getSuperEps() const {
    return _superEps;
}

LearningRateSchedule& Weights::getLearningRateSchedule() const {
    return *_lrs;
}

/*
 * ===============
 * TreeWeights
 * ===============
 */
TreeWeights::TreeWeights(SoftmaxTree& tree, Matrix& hWeights, Matrix& hWeightsInc, LearningRateSchedule& lrs, float wcBase, float mom)
    : Weights(hWeights, hWeightsInc, lrs, wcBase, 0, mom, 0, true), _tree(&tree) {
    assert(hWeights.isTrans());
    assert(hWeightsInc.isTrans());
}

NVMatrix& TreeWeights::getW() const {
    return *_leafWeights;
}

NVMatrix& TreeWeights::getInc() const {
    return *_leafInc;
}

NVMatrix& TreeWeights::getGrad() const {
    return *_leafGrad;
}

NVMatrix& TreeWeights::getAllW() const {
    return *_weights;
}

NVMatrix& TreeWeights::getAllInc() const {
    return *_weightsInc;
}

NVMatrix& TreeWeights::getAllGrad() const {
    return *_weightsGrad;
}

void TreeWeights::copyToGPU() {
    assert(!_onGPU);
    Weights::copyToGPU();
    _tree->finalize();
    _effWeights.resize(*_weights);
    _leafWeights = &_effWeights.sliceCols(0, _tree->getNumLeaves());
    _leafGrad = &_weightsGrad->sliceCols(0, _tree->getNumLeaves());
    _leafInc = &_weightsInc->sliceCols(0, _tree->getNumLeaves());
    assert(_leafWeights->isView());
    makeWeights();
}

int TreeWeights::getNumRows() const {
    return _tree->getNumNodes();
}

void TreeWeights::update(float progress) {
    // Only the true owner of the weights performs updates
    if (_lrs->getBaseRate() > 0) {
        assert(_onGPU);
        distributeGradients();
        _tree->updateWeights(*_weights, *_weightsInc, *_weightsGrad, _lrs->getRate(progress), _mom, _wc);
        makeWeights();
        _numUpdates = 0;
    }
}

void TreeWeights::makeWeights() {
    _tree->makeWeights(*_weights, _effWeights);
}

void TreeWeights::distributeGradients() {
    _tree->distributeGradients(*_weightsGrad);
}

/*
 * ===============
 * DummyWeights
 * ===============
 */
DummyWeights::DummyWeights(Matrix& hWeights, Matrix& hWeightsInc,
                           NVMatrix& weights, NVMatrix& incs, NVMatrix& grads)
    : Weights(hWeights, hWeightsInc, *new LearningRateSchedule(0), 0, 0, 0, 0, true, false) {
    _onGPU = true;
    _weights = &weights;
    _weightsInc = &incs;
    _weightsGrad = &grads;
}

/*
 * ===============
 * WeightList
 * ===============
 */
Weights& WeightList::operator[](const int idx) const {
    return *_weightList[idx];
}

WeightList::~WeightList() {
    for (int i = 0; i < _weightList.size(); i++) {
        delete _weightList[i];
    }
}

WeightList::WeightList() {
}

void WeightList::addWeights(Weights& w) {
    _weightList.push_back(&w);
}

void WeightList::update(float progress) {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->update(progress);
    }
}

void WeightList::copyToCPU() {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->copyToCPU();
    }
}

void WeightList::copyToGPU() {
    for (int i = 0; i < getSize(); i++) {
        _weightList[i]->copyToGPU();
    }
}

int WeightList::getSize() const {
    return _weightList.size();
}
279
src/worker.cu
Normal file

@@ -0,0 +1,279 @@
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <algorithm>
#include <util.cuh>
#include <worker.cuh>

using namespace std;

/*
 * ====================
 * WorkResult
 * ====================
 */
WorkResult::WorkResult(WorkResult::RESULTS resultType, Cost& results) : _resultType(resultType), _results(&results) {
}

WorkResult::WorkResult(WorkResult::RESULTS resultType) : _resultType(resultType), _results(NULL) {
}

WorkResult::~WorkResult() {
    delete _results; // delete NULL is ok
}

Cost& WorkResult::getResults() const {
    return *_results;
}

WorkResult::RESULTS WorkResult::getResultType() const {
    return _resultType;
}

/*
 * ====================
 * Worker
 * ====================
 */
Worker::Worker(ConvNet& convNet) : _convNet(&convNet) {
}

/*
 * ====================
 * DataWorker
 * ====================
 */
DataWorker::DataWorker(ConvNet& convNet, CPUData& data) : Worker(convNet), _data(&data) {
    _dp = &convNet.getDataProvider();
    _dp->setData(*_data);
}

DataWorker::~DataWorker() {
    _dp->clearData();
}

/*
 * ====================
 * TrainingWorker
 * ====================
 */
TrainingWorker::TrainingWorker(ConvNet& convNet, CPUData& data, double progress, bool test)
    : DataWorker(convNet, data), _progress(progress), _test(test) {
}

// Need to setData here (as opposed to the constructor) because the constructor executes in
// the original CPU thread, which is not the one with GPU access.
void TrainingWorker::run() {
    _convNet->setTrainingProgress(_progress);
    Cost& batchCost = *new Cost(0);
    for (int i = 0; i < _dp->getNumMinibatches(); i++) {
        _convNet->fprop(i, _test ? PASS_TEST : PASS_TRAIN);
        _convNet->getCost(batchCost);

        if (!_test) {
            _convNet->bprop(PASS_TRAIN);
            _convNet->updateWeights();
        }
    }
    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}

/*
 * ====================
 * SyncWorker
 * ====================
 */
SyncWorker::SyncWorker(ConvNet& convNet) : Worker(convNet) {
}

void SyncWorker::run() {
    _convNet->copyToCPU();
    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::SYNC_DONE));
}

/*
 * ====================
 * GradCheckWorker
 * ====================
 */
GradCheckWorker::GradCheckWorker(ConvNet& convNet, CPUData& data)
    : DataWorker(convNet, data) {
}

void GradCheckWorker::run() {
    _convNet->checkGradients();
    exit(0);
}
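
checkGradients() is defined with ConvNet outside this hunk; gradient checkers of this kind conventionally compare each backprop gradient against a central finite difference of the cost. The standard check on a toy function, as a standalone sketch:

#include <cmath>
#include <cstdio>

// Central-difference gradient check on f(w) = 0.5 * w^2, whose analytic
// gradient is w. The relative error should be tiny when bprop is correct.
int main() {
    double w = 1.7, h = 1e-5;
    double analytic = w;  // d/dw [0.5 w^2] = w
    double numeric = (0.5 * (w + h) * (w + h) - 0.5 * (w - h) * (w - h)) / (2 * h);
    printf("analytic %.8f numeric %.8f relerr %g\n",
           analytic, numeric, fabs(analytic - numeric) / fabs(analytic));
    return 0;
}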

/*
 * ====================
 * MultiviewTestWorker
 * ====================
 */
MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews, Matrix& cpuProbs, const char* logregName)
    : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(&cpuProbs), _logregName(logregName) {
    assert(_data->getNumCases() % _numViews == 0);
}

MultiviewTestWorker::MultiviewTestWorker(ConvNet& convNet, CPUData& data, int numViews)
    : DataWorker(convNet, data), _numViews(numViews), _cpuProbs(NULL), _logregName("") {
    assert(_data->getNumCases() % _numViews == 0);
}

MultiviewTestWorker::~MultiviewTestWorker() {
    delete _cpuProbs;
}

void MultiviewTestWorker::run() {
    int numCasesReal = _dp->getNumCases() / _numViews;
    int numMiniReal = DIVUP(numCasesReal, _dp->getMinibatchSize());

    Cost& batchCost = *new Cost(0);
    for (int i = 0; i < numMiniReal; i++) {
        for (int v = 0; v < _numViews; v++) {
            CPUData& mini = _dp->getDataSlice(v * numCasesReal + i * _dp->getMinibatchSize(),
                                              min((v + 1) * numCasesReal, v * numCasesReal + (i + 1) * _dp->getMinibatchSize()));
            _convNet->fprop(mini, v == 0 ? PASS_MULTIVIEW_TEST_START : v == _numViews - 1 ? PASS_MULTIVIEW_TEST_END : PASS_MULTIVIEW_TEST);
        }
        if (_cpuProbs != NULL) {
            LogregCostLayer& logregLayer = *dynamic_cast<LogregCostLayer*>(&_convNet->getLayer(_logregName));
            cudaSetDevice(logregLayer.getDeviceID());
            Matrix& miniProbs = _cpuProbs->sliceRows(i * _dp->getMinibatchSize(),
                                                     min(numCasesReal, (i + 1) * _dp->getMinibatchSize()));
            NVMatrix& acts = logregLayer.getProbsAccum();
            NVMatrix acts_T;
            acts.transpose(acts_T);
            acts_T.copyToHost(miniProbs);

            delete &miniProbs;
        }
        _convNet->getCost(batchCost);
    }
    cudaDeviceSynchronize();
    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}
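
The worker pushes each minibatch through the net once per view, and the logreg layer's getProbsAccum() name suggests the class probabilities are accumulated across the PASS_MULTIVIEW_TEST_* passes, i.e. multiview testing averages predictions over crops of the same case. A scalar sketch of that averaging, offered as an assumption about the accumulation semantics:

#include <cstdio>

// Hypothetical view-averaging: probs[v][c] are per-view class
// probabilities for one case; the multiview prediction averages them.
int main() {
    const int numViews = 2, numClasses = 3;
    float probs[numViews][numClasses] = {{0.7f, 0.2f, 0.1f},
                                         {0.5f, 0.4f, 0.1f}};
    float avg[numClasses] = {0};
    for (int v = 0; v < numViews; v++)
        for (int c = 0; c < numClasses; c++)
            avg[c] += probs[v][c] / numViews;
    for (int c = 0; c < numClasses; c++)
        printf("class %d: %.3f\n", c, avg[c]);
    return 0;
}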

/*
 * ====================
 * FeatureWorker
 * ====================
 */
FeatureWorker::FeatureWorker(ConvNet& convNet, CPUData& data, MatrixV& ftrs, stringv& layerNames)
    : DataWorker(convNet, data), _ftrs(&ftrs), _layerNames(&layerNames) {
    assert(layerNames.size() == ftrs.size());
    for (int i = 0; i < layerNames.size(); i++) {
        assert(ftrs[i]->getNumRows() == data.getNumCases());
        assert(!ftrs[i]->isTrans());
    }
}

FeatureWorker::~FeatureWorker() {
    for (int i = 0; i < _ftrs->size(); i++) {
        delete _ftrs->at(i);
    }
    delete _ftrs;
    delete _layerNames;
}

void FeatureWorker::run() {

    Cost& batchCost = *new Cost(0);

    for (int i = 0; i < _dp->getNumMinibatches(); i++) {
        _convNet->fprop(i, PASS_FEATURE_GEN);
        _convNet->getCost(batchCost);
        for (int f = 0; f < _layerNames->size(); f++) {
            Layer& ftrLayer = _convNet->getLayer(_layerNames->at(f));
            int d = ftrLayer.getDeviceID();
            cudaSetDevice(d);
            Matrix& miniFtrs = _ftrs->at(f)->sliceRows(i * _dp->getMinibatchSize(),
                                                       min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
            NVMatrix& acts = ftrLayer.getActs();
            NVMatrix acts_T;
            if (acts.isTrans()) {
                NVMatrix& soft_T = acts.getTranspose();
                soft_T.transpose(acts_T);
                delete &soft_T;
            } else {
                acts.transpose(acts_T);
            }
            acts_T.copyToHost(miniFtrs);
            delete &miniFtrs;
        }
    }
    cudaDeviceSynchronize();
    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}

/*
 * ====================
 * DataGradWorker
 * ====================
 */
DataGradWorker::DataGradWorker(ConvNet& convNet, CPUData& data, Matrix& dataGrads, int dataLayerIdx, int softmaxLayerIdx)
    : DataWorker(convNet, data), _dataGrads(&dataGrads), _dataLayerIdx(dataLayerIdx), _softmaxLayerIdx(softmaxLayerIdx) {
    assert(dataGrads.getNumRows() == data.getNumCases());
    assert(!dataGrads.isTrans());
}

DataGradWorker::~DataGradWorker() {
    delete _dataGrads;
}

void DataGradWorker::run() {
//    DataLayer& dataLayer = *dynamic_cast<DataLayer*>(&_convNet->getLayer(_dataLayerIdx));
//    SoftmaxLayer& softmaxLayer = *dynamic_cast<SoftmaxLayer*>(&_convNet->getLayer(_softmaxLayerIdx));
//    softmaxLayer.setDoLogregGrad(false);
//    Cost& batchCost = *new Cost(0);
//    for (int i = 0; i < _dp->getNumMinibatches(); i++) {
//        _convNet->fprop(i, PASS_TEST);
//        _convNet->getCost(batchCost);
//        softmaxLayer.getActs().apply(NVMatrixOps::Log(), softmaxLayer.getActsGrad());
//
//        softmaxLayer.getActsGrad().addScalar(1);
//        softmaxLayer.getActsGrad().scale(-1);
//        softmaxLayer.incRcvdBInputs();
//        softmaxLayer.bprop(PASS_TEST);
//
//        Matrix& miniDataGrads = _dataGrads->sliceRows(i * _dp->getMinibatchSize(),
//                                                      min(_dp->getNumCases(), (i + 1) * _dp->getMinibatchSize()));
//        NVMatrix& grads = dataLayer.getActsGrad();
//        NVMatrix grads_T;
//        if (grads.isTrans()) {
//            NVMatrix& soft_T = grads.getTranspose();
//            soft_T.transpose(grads_T);
//            delete &soft_T;
//        } else {
//            grads.transpose(grads_T);
//        }
//        grads_T.copyToHost(miniDataGrads);
//        delete &miniDataGrads;
//
//        _convNet->reset();
//    }
//    cudaThreadSynchronize();
//    _convNet->getResultQueue().enqueue(new WorkResult(WorkResult::BATCH_DONE, batchCost));
}