As someone who has previously used `Theano`

/ `Lasagne`

, I noticed immediately
when a Bengio-lab paper offered its accompanying code in `TensorFlow`

. It’s possible
that the tides are changing for `Theano`

- and I want to make sure that my
current Deep Learning Workshop repo remains relevant to what people believe they should
learn…

CNN MNIST using various Frameworks
The following is just a collection of code samples for solving CNN MNIST
(all using roughly the same network structure)
using different deep learning frameworks (without additional sugar layers) :

If you have any suggestions about other frameworks I should consider, please leave a comment.

Caffe is configured via a plain-text `.prototxt`

file, and then run on the command line with switches.

The following is from the main Caffe repo ,
with a companion tutorial :

```
# LeNet-style network definition: data -> conv1 -> pool1 -> conv2 -> pool2
# -> ip1 -> relu1 -> ip2 -> prob.
name: "LeNet"
# Input layer: produces the "data" blob with dims 64 x 1 x 28 x 28.
layer {
name: "data"
type: "Input"
top: "data"
input_param { shape: { dim: 64 dim: 1 dim: 28 dim: 28 } }
}
# First convolution: 20 outputs, 5x5 kernel, stride 1.
# The two param blocks set learning-rate multipliers 1 and 2
# (presumably for weights and biases respectively - confirm against Caffe docs).
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# First pooling stage: MAX pooling, 2x2 kernel, stride 2.
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# Second convolution: 50 outputs, 5x5 kernel, stride 1.
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Second pooling stage: MAX pooling, 2x2 kernel, stride 2.
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# First inner-product layer: 500 outputs.
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool2"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# ReLU whose bottom and top are both "ip1", i.e. it overwrites that blob.
layer {
name: "relu1"
type: "ReLU"
bottom: "ip1"
top: "ip1"
}
# Second inner-product layer: 10 outputs.
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Softmax over the 10 outputs of "ip2"; the result is the "prob" blob.
layer {
name: "prob"
type: "Softmax"
bottom: "ip2"
top: "prob"
}
```

When building a simple CNN for MNIST, note that Torch code involves `lua`

rather than a
Python interface.

The following is from the main Torch demos repo :

----------------------------------------------------------------------
-- This script shows how to train different models on the MNIST
-- dataset, using multiple optimization techniques (SGD, LBFGS)
--
-- This script demonstrates a classical example of training
-- well-known models (convnet, MLP, logistic regression)
-- on a 10-class classification problem.
--
-- It illustrates several points:
-- 1/ description of the model
-- 2/ choice of a loss function (criterion) to minimize
-- 3/ creation of a dataset as a simple Lua table
-- 4/ description of training and test procedures
--
-- Clement Farabet
----------------------------------------------------------------------
require 'torch'
require 'nn'
require 'nnx'
require 'optim'
require 'image'
require 'dataset-mnist'
require 'pl'
require 'paths'
----------------------------------------------------------------------
-- parse command-line options
--
-- The usage text below is handed to `lapp` (presumably provided by
-- `require 'pl'` above - confirm); the parsed values end up in `opt`,
-- which the rest of the script reads (opt.save, opt.model, opt.batchSize, ...).
-- NOTE(review): the short flag `-m` is declared twice below, once for
-- --model and once for --momentum, and the --model help text contains the
-- typo "tor train". Both are part of the usage string, so they are left
-- untouched here; decide which option should keep `-m`.
local opt = lapp [[
-s,--save (default "logs") subdirectory to save logs
-n,--network (default "") reload pretrained network
-m,--model (default "convnet") type of model tor train: convnet | mlp | linear
-f,--full use the full dataset
-p,--plot plot while training
-o,--optimization (default "SGD") optimization: SGD | LBFGS
-r,--learningRate (default 0.05) learning rate, for SGD only
-b,--batchSize (default 10) batch size
-m,--momentum (default 0) momentum, for SGD only
-i,--maxIter (default 3) maximum nb of iterations per batch, for LBFGS
--coefL1 (default 0) L1 penalty on the weights
--coefL2 (default 0) L2 penalty on the weights
-t,--threads (default 4) number of threads
]]
-- fix seed (always 1, so the random number stream is the same on every run)
torch . manualSeed ( 1 )
-- threads: apply the requested thread count, then report the value torch returns
torch . setnumthreads ( opt . threads )
print ( '<torch> set nb of threads to ' .. torch . getnumthreads ())
-- use floats, for SGD (the default tensor type is left unchanged for LBFGS)
if opt . optimization == 'SGD' then
torch . setdefaulttensortype ( 'torch.FloatTensor' )
end
-- batch size? refuse to run LBFGS with fewer than 100 samples per batch
-- NOTE(review): the check rejects sizes below 100 while the message recommends 1000.
if opt . optimization == 'LBFGS' and opt . batchSize < 100 then
error ( 'LBFGS should not be used with small mini-batches; 1000 is recommended' )
end
----------------------------------------------------------------------
-- define model to train
-- on the 10-class classification problem
--
-- class labels; the number of entries is the size of the network's output layer
classes = { '1', '2', '3', '4', '5', '6', '7', '8', '9', '10' }

-- geometry: width and height of input images
geometry = { 32, 32 }

if opt.network == '' then
   -- no pretrained network was given: build the model selected by opt.model
   model = nn.Sequential()

   if opt.model == 'convnet' then
      ------------------------------------------------------------
      -- convolutional network
      ------------------------------------------------------------
      -- stage 1 : filter bank -> squashing -> max pooling
      model:add(nn.SpatialConvolutionMM(1, 32, 5, 5))
      model:add(nn.Tanh())
      model:add(nn.SpatialMaxPooling(3, 3, 3, 3, 1, 1))
      -- stage 2 : filter bank -> squashing -> max pooling
      model:add(nn.SpatialConvolutionMM(32, 64, 5, 5))
      model:add(nn.Tanh())
      model:add(nn.SpatialMaxPooling(2, 2, 2, 2))
      -- stage 3 : standard 2-layer MLP on the flattened 64*3*3 features
      model:add(nn.Reshape(64 * 3 * 3))
      model:add(nn.Linear(64 * 3 * 3, 200))
      model:add(nn.Tanh())
      model:add(nn.Linear(200, #classes))
      ------------------------------------------------------------

   elseif opt.model == 'mlp' then
      ------------------------------------------------------------
      -- regular 2-layer MLP on the flattened image (1024 values)
      ------------------------------------------------------------
      model:add(nn.Reshape(1024))
      model:add(nn.Linear(1024, 2048))
      model:add(nn.Tanh())
      model:add(nn.Linear(2048, #classes))
      ------------------------------------------------------------

   elseif opt.model == 'linear' then
      ------------------------------------------------------------
      -- simple linear model: logistic regression
      ------------------------------------------------------------
      model:add(nn.Reshape(1024))
      model:add(nn.Linear(1024, #classes))
      ------------------------------------------------------------

   else
      -- The script has no `cmd` object (options are parsed with lapp), so
      -- the former `cmd:text()` call failed with an unrelated nil-index
      -- error. Abort with a message that names the offending value instead.
      error('Unknown model type: ' .. tostring(opt.model))
   end
else
   -- a network file was given: load it instead of building a new model
   print('<trainer> reloading previously trained network')
   model = torch.load(opt.network)
end
-- retrieve parameters and gradients
-- (both are globals; train() reads and updates them through the optimizers)
parameters , gradParameters = model : getParameters ()
-- verbose
-- NOTE: the model is printed before LogSoftMax is appended below, so the
-- printout does not include that final layer.
print ( '<mnist> using model:' )
print ( model )
----------------------------------------------------------------------
-- loss function: negative log-likelihood
--
-- LogSoftMax is appended to the model whether it was freshly built or
-- reloaded via opt.network.
-- NOTE(review): a reloaded network that already ends in LogSoftMax would get
-- a second one here - confirm how saved networks are produced.
model : add ( nn . LogSoftMax ())
criterion = nn . ClassNLLCriterion ()
----------------------------------------------------------------------
-- get/create dataset
--
-- dataset sizes: the full set with --full, otherwise a small subset
if opt . full then
nbTrainingPatches = 60000
nbTestingPatches = 10000
else
nbTrainingPatches = 2000
nbTestingPatches = 1000
print ( '<warning> only using 2000 samples to train quickly (use flag -full to use 60000 samples)' )
end
-- create training set and normalize
-- NOTE(review): `mean` and `std` are never assigned anywhere in this script,
-- so both normalizeGlobal calls receive nil for them. Presumably
-- normalizeGlobal then computes the statistics from its own data, which
-- would normalize the test set independently of the training set - verify
-- against dataset-mnist.
trainData = mnist . loadTrainSet ( nbTrainingPatches , geometry )
trainData : normalizeGlobal ( mean , std )
-- create test set and normalize
testData = mnist . loadTestSet ( nbTestingPatches , geometry )
testData : normalizeGlobal ( mean , std )
----------------------------------------------------------------------
-- define training and testing functions
--
-- this matrix records the current confusion across classes
-- (shared by train() and test(); each of them zeroes it after reporting)
confusion = optim . ConfusionMatrix ( classes )
-- log results to files (train.log / test.log inside the opt.save directory)
trainLogger = optim . Logger ( paths . concat ( opt . save , 'train.log' ))
testLogger = optim . Logger ( paths . concat ( opt . save , 'test.log' ))
-- training function
--
-- Runs one training epoch over `dataset`: the samples are visited in order in
-- mini-batches of at most opt.batchSize, each batch is handed to the
-- optimizer selected by opt.optimization, and the confusion matrix plus the
-- training log are updated at the end of the epoch.
--
-- dataset: object supporting dataset:size() and dataset[i], where
--          dataset[i][1] is the input tensor and dataset[i][2] a score
--          vector whose arg-max is the class index.
--
-- Uses the globals model, criterion, parameters, gradParameters, confusion,
-- trainLogger, geometry and opt; increments the global `epoch`.
function train(dataset)
   -- epoch tracker (global, so it survives between calls)
   epoch = epoch or 1

   -- local vars
   local time = sys.clock()
   local nbSamples = dataset:size()

   -- do one epoch
   print('<trainer> on training set:')
   print("<trainer> online epoch # " .. epoch .. ' [batchSize = ' .. opt.batchSize .. ']')
   for t = 1, nbSamples, opt.batchSize do
      -- create mini batch
      -- The last batch of an epoch can hold fewer than opt.batchSize samples.
      -- Size the tensors to the samples actually available, so that no
      -- uninitialised rows are fed to the model or counted in the confusion
      -- matrix.
      local batchSize = math.min(opt.batchSize, nbSamples - t + 1)
      local inputs = torch.Tensor(batchSize, 1, geometry[1], geometry[2])
      local targets = torch.Tensor(batchSize)
      for k = 1, batchSize do
         -- load new sample
         local sample = dataset[t + k - 1]
         local input = sample[1]:clone()
         -- the target class is the index of the largest entry of sample[2]
         local _, target = sample[2]:clone():max(1)
         inputs[k] = input
         targets[k] = target:squeeze()
      end

      -- create closure to evaluate f(X) and df/dX
      local feval = function(x)
         -- just in case:
         collectgarbage()

         -- get new parameters
         if x ~= parameters then
            parameters:copy(x)
         end

         -- reset gradients
         gradParameters:zero()

         -- evaluate function for complete mini batch
         local outputs = model:forward(inputs)
         local f = criterion:forward(outputs, targets)

         -- estimate df/dW
         local df_do = criterion:backward(outputs, targets)
         model:backward(inputs, df_do)

         -- penalties (L1 and L2):
         if opt.coefL1 ~= 0 or opt.coefL2 ~= 0 then
            -- locals:
            local norm, sign = torch.norm, torch.sign

            -- Loss:
            f = f + opt.coefL1 * norm(parameters, 1)
            f = f + opt.coefL2 * norm(parameters, 2) ^ 2 / 2

            -- Gradients:
            gradParameters:add(sign(parameters):mul(opt.coefL1) + parameters:clone():mul(opt.coefL2))
         end

         -- update confusion (only over the samples present in this batch)
         for i = 1, batchSize do
            confusion:add(outputs[i], targets[i])
         end

         -- return f and df/dX
         return f, gradParameters
      end

      -- optimize on current mini-batch
      if opt.optimization == 'LBFGS' then
         -- Perform LBFGS step (state is global and created on first use):
         lbfgsState = lbfgsState or {
            maxIter = opt.maxIter,
            lineSearch = optim.lswolfe
         }
         optim.lbfgs(feval, parameters, lbfgsState)

         -- disp report:
         print('LBFGS step')
         print(' - progress in batch: ' .. t .. '/' .. nbSamples)
         print(' - nb of iterations: ' .. lbfgsState.nIter)
         print(' - nb of function evalutions: ' .. lbfgsState.funcEval)

      elseif opt.optimization == 'SGD' then
         -- Perform SGD step (state is global and created on first use):
         sgdState = sgdState or {
            learningRate = opt.learningRate,
            momentum = opt.momentum,
            learningRateDecay = 5e-7
         }
         optim.sgd(feval, parameters, sgdState)

         -- disp progress
         xlua.progress(t, nbSamples)

      else
         error('unknown optimization method')
      end
   end

   -- time taken, averaged per sample
   time = sys.clock() - time
   time = time / nbSamples
   print("<trainer> time to learn 1 sample = " .. (time * 1000) .. 'ms')

   -- print confusion matrix, log the accuracy, then reset for the next user
   print(confusion)
   trainLogger:add{['% mean class accuracy (train set)'] = confusion.totalValid * 100}
   confusion:zero()

   -- save/log current net
   -- NOTE: the actual torch.save call is commented out below, so only the
   -- directory creation and the rename of an existing file take place.
   local filename = paths.concat(opt.save, 'mnist.net')
   os.execute('mkdir -p ' .. sys.dirname(filename))
   if paths.filep(filename) then
      os.execute('mv ' .. filename .. ' ' .. filename .. '.old')
   end
   print('<trainer> saving network to ' .. filename)
   -- torch.save(filename, model)

   -- next epoch
   epoch = epoch + 1
end
-- test function
--
-- Evaluates the current model on `dataset` in mini-batches of at most
-- opt.batchSize, accumulates predictions into the shared confusion matrix,
-- prints it, logs the accuracy and resets the matrix.
--
-- dataset: object supporting dataset:size() and dataset[i], where
--          dataset[i][1] is the input tensor and dataset[i][2] a score
--          vector whose arg-max is the class index.
--
-- Uses the globals model, confusion, testLogger, geometry and opt.
function test(dataset)
   -- local vars
   local time = sys.clock()
   local nbSamples = dataset:size()

   -- test over given dataset
   print('<trainer> on testing Set:')
   for t = 1, nbSamples, opt.batchSize do
      -- disp progress
      xlua.progress(t, nbSamples)

      -- create mini batch
      -- The last batch can hold fewer than opt.batchSize samples; size the
      -- tensors to what is actually available so that uninitialised rows
      -- are never scored.
      local batchSize = math.min(opt.batchSize, nbSamples - t + 1)
      local inputs = torch.Tensor(batchSize, 1, geometry[1], geometry[2])
      local targets = torch.Tensor(batchSize)
      for k = 1, batchSize do
         -- load new sample
         local sample = dataset[t + k - 1]
         local input = sample[1]:clone()
         -- the target class is the index of the largest entry of sample[2]
         local _, target = sample[2]:clone():max(1)
         inputs[k] = input
         targets[k] = target:squeeze()
      end

      -- test samples
      local preds = model:forward(inputs)

      -- confusion (only over the samples present in this batch):
      for i = 1, batchSize do
         confusion:add(preds[i], targets[i])
      end
   end

   -- timing, averaged per sample
   time = sys.clock() - time
   time = time / nbSamples
   print("<trainer> time to test 1 sample = " .. (time * 1000) .. 'ms')

   -- print confusion matrix, log the accuracy, then reset for the next user
   print(confusion)
   testLogger:add{['% mean class accuracy (test set)'] = confusion.totalValid * 100}
   confusion:zero()
end
----------------------------------------------------------------------
-- and train!
--
-- Alternate one training epoch and one evaluation pass.
-- NOTE: the loop has no exit condition, so the script keeps running until
-- it is interrupted or an error is raised.
while true do
-- train/test
train ( trainData )
test ( testData )
-- plot errors (only when the --plot option was given)
if opt . plot then
trainLogger : style {[ '% mean class accuracy (train set)' ] = '-' }
testLogger : style {[ '% mean class accuracy (test set)' ] = '-' }
trainLogger : plot ()
testLogger : plot ()
end
end

The following is from a detailed blog posting ,
which is a CNN translation of the TensorFlow CNN for MNIST, since the
standard example that Microsoft gives in its CNTK tutorials focusses on a fully-connected network :

# ... loading of the MNIST data set etc ...
# ...
def create_convolutional_neural_network(input_vars, out_dims, dropout_prob=0.0):
    """Stack two conv/pool stages, a dense layer, dropout and a linear output.

    input_vars   -- input the first convolution is applied to
    out_dims     -- number of units in the final Dense layer
    dropout_prob -- value passed to the Dropout layer (default 0.0)

    Returns the output of the final Dense layer, which has no activation.
    """
    relu = cntk.ops.relu

    # Two identical stages that differ only in filter count (32, then 64):
    # 5x5 convolution with ReLU, then 2x2 max pooling with stride 2.
    features = input_vars
    for num_filters in (32, 64):
        features = Convolution((5, 5), num_filters, strides=1, activation=relu, pad=True)(features)
        features = MaxPooling((2, 2), strides=(2, 2), pad=True)(features)

    hidden = Dense(1024, activation=relu)(features)
    hidden = Dropout(dropout_prob)(hidden)
    return Dense(out_dims, activation=None)(hidden)
# Network input: one variable shaped like an image
input_vars = cntk.ops.input_variable(image_shape, np.float32)

# Build the network on that input, with dropout probability 0.5
output = create_convolutional_neural_network(input_vars, output_dim, dropout_prob=0.5)

# The labels are the second input the trainer needs
labels = cntk.ops.input_variable(output_dim, np.float32)

# Trainer hyper-parameters
train_minibatch_size = 50
learning_rate = 1e-4
momentum = 0.9

# Loss to minimise, and the error metric that is reported
loss = cntk.ops.cross_entropy_with_softmax(output, labels)
label_error = cntk.ops.classification_error(output, labels)

# The trainer drives the learner over the network's parameters
learner = cntk.adam_sgd(output.parameters, learning_rate, momentum)
trainer = cntk.Trainer(output, loss, label_error, [learner])

num_training_epoch = 1
training_progress_output_freq = 10

for epoch in range(num_training_epoch):
    sample_count = 0
    num_minibatch = 0

    # Consume minibatches until every training sample has been seen once
    while sample_count < num_train_samples:
        remaining = num_train_samples - sample_count
        minibatch = train_minibatch_source.next_minibatch(min(train_minibatch_size, remaining))

        # Bind the model's input variables to this minibatch's data
        data = {
            input_vars: minibatch[training_features],
            labels: minibatch[training_labels],
        }
        trainer.train_minibatch(data)

        sample_count += data[labels].num_samples
        num_minibatch += 1

        # Report progress only every `training_progress_output_freq` minibatches
        if num_minibatch % training_progress_output_freq != 0:
            continue
        training_loss = cntk.get_train_loss(trainer)
        eval_error = cntk.get_train_eval_criterion(trainer)
        print("Epoch %d | # of Samples: %6d | Loss: %.6f | Error: %.6f" % (epoch, sample_count, training_loss, eval_error))

print("Training Completed.", end=" \n\n ")

This proved more difficult to find than expected, because all the relevant Issues seem to be in Chinese…

The following is from a VGG16 CNN implementation within the main Repo :

# Definition of 'img_conv_group' is in :
# https://github.com/PaddlePaddle/Paddle/blob/master/python/paddle/trainer_config_helpers/networks.py
def small_vgg(input_image, num_channels, num_classes):
    """Build a small VGG-style classifier on top of `input_image`.

    input_image  -- layer fed to the first convolution group
    num_channels -- channel count, forwarded only to the first group
    num_classes  -- size of the final softmax layer

    Returns the final fc_layer (softmax activation, `num_classes` units).
    """

    def conv_group(ipt, num_filter, times, dropouts, num_channels_=None):
        # `times` 3x3 convolutions of `num_filter` filters each (ReLU, with
        # batch norm and the given per-conv drop rates), followed by 2x2 max
        # pooling with stride 2 - all delegated to img_conv_group.
        return img_conv_group(
            input=ipt,
            num_channels=num_channels_,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * times,
            conv_filter_size=3,
            conv_act=ReluActivation(),
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type=MaxPooling())

    # Only the first group is told the input channel count.
    tmp = conv_group(input_image, 64, 2, [0.3, 0], num_channels)
    # Remaining groups: (filters, per-conv drop rates); one conv per drop rate.
    for num_filter, dropouts in ((128, [0.4, 0]),
                                 (256, [0.4, 0.4, 0]),
                                 (512, [0.4, 0.4, 0])):
        tmp = conv_group(tmp, num_filter, len(dropouts), dropouts)

    tmp = img_pool_layer(input=tmp, stride=2, pool_size=2, pool_type=MaxPooling())
    tmp = dropout_layer(input=tmp, dropout_rate=0.5)
    # Linear fully-connected layer; the ReLU is applied by the batch norm that follows.
    tmp = fc_layer(
        input=tmp,
        size=512,
        layer_attr=ExtraAttr(drop_rate=0.5),
        act=LinearActivation())
    tmp = batch_norm_layer(input=tmp, act=ReluActivation())
    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
from paddle.trainer_config_helpers import *
# Switch between training and prediction configuration (training by default)
is_predict = get_config_arg("is_predict", bool, False)

#################### Data Configuration ####################
# Data sources are only declared for training
if not is_predict:
    data_dir = './data/'
    define_py_data_sources2(
        train_list=data_dir + 'train.list',
        test_list=data_dir + 'test.list',
        module='mnist_provider',
        obj='process')

#################### Algorithm Configuration ####################
settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * 128))

#################### Network Configuration ####################
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)

# small_vgg is predefined in trainer_config_helpers.network
predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)

if is_predict:
    # Prediction: expose the network output directly
    outputs(predict)
else:
    # Training: add a label input and output the classification cost
    lbl = data_layer(name="label", size=label_size)
    inputs(img, lbl)
    outputs(classification_cost(input=predict, label=lbl))

The following is from an MXNet blog posting -
there’s a large model zoo too,
but those examples have lots of common helper code factored out, whereas the blog does it straight-forwardly :

# Get the data
import numpy as np
import os
import urllib
import gzip
import struct
def download_data(url, force_download=True):
    """Download `url` into the current directory and return the local file name.

    url            -- address of the file; the local name is the text after
                      the last "/" of the URL
    force_download -- when True (the default) the file is fetched even if a
                      file of that name already exists; when False an
                      existing file is reused

    Returns the local file name (not a full path).
    """
    # `urlretrieve` lives in `urllib` on Python 2 but in `urllib.request` on
    # Python 3; resolve it locally so the function works on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    fname = url.split("/")[-1]
    if force_download or not os.path.exists(fname):
        urlretrieve(url, fname)
    return fname
def read_data(label_url, image_url):
    """Download and decode one gzip-compressed label file and image file.

    label_url -- URL of the label file: an 8-byte big-endian header (two
                 unsigned ints) followed by one byte per label
    image_url -- URL of the image file: a 16-byte big-endian header (four
                 unsigned ints, the last two being rows and cols) followed
                 by one byte per pixel

    Returns a tuple (label, image): `label` is a 1-D int8 array and `image`
    a uint8 array of shape (len(label), rows, cols).
    """
    with gzip.open(download_data(label_url), 'rb') as flbl:
        # The header values are read only to skip past them.
        _magic, _num = struct.unpack(">II", flbl.read(8))
        # np.frombuffer replaces the deprecated binary-mode np.fromstring;
        # .copy() keeps the result writable, as fromstring's result was.
        label = np.frombuffer(flbl.read(), dtype=np.int8).copy()
    with gzip.open(download_data(image_url), 'rb') as fimg:
        _magic, _num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        pixels = np.frombuffer(fimg.read(), dtype=np.uint8)
        image = pixels.reshape(len(label), rows, cols).copy()
    return (label, image)
# Base URL that the four MNIST archive names below are appended to
path = 'http://yann.lecun.com/exdb/mnist/'
# Training split: labels and images
( train_lbl , train_img ) = read_data (
path + 'train-labels-idx1-ubyte.gz' , path + 'train-images-idx3-ubyte.gz' )
# Validation split, read from the "t10k" files
( val_lbl , val_img ) = read_data (
path + 't10k-labels-idx1-ubyte.gz' , path + 't10k-images-idx3-ubyte.gz' )
# MXNet-specific code...
import mxnet as mx
def to4d ( img ):
return img . reshape ( img . shape [ 0 ], 1 , 28 , 28 ). astype ( np . float32 ) / 255
# Iterators over the training and validation arrays; only the training
# iterator is asked to shuffle
batch_size = 100
train_iter = mx . io . NDArrayIter ( to4d ( train_img ), train_lbl , batch_size , shuffle = True )
val_iter = mx . io . NDArrayIter ( to4d ( val_img ), val_lbl , batch_size )
# CNN model
data = mx . symbol . Variable ( 'data' )
# first conv layer: 20 filters of 5x5, tanh, then 2x2 max pooling with stride 2
conv1 = mx . sym . Convolution ( data = data , kernel = ( 5 , 5 ), num_filter = 20 )
tanh1 = mx . sym . Activation ( data = conv1 , act_type = "tanh" )
pool1 = mx . sym . Pooling ( data = tanh1 , pool_type = "max" , kernel = ( 2 , 2 ), stride = ( 2 , 2 ))
# second conv layer: 50 filters of 5x5, tanh, then 2x2 max pooling with stride 2
conv2 = mx . sym . Convolution ( data = pool1 , kernel = ( 5 , 5 ), num_filter = 50 )
tanh2 = mx . sym . Activation ( data = conv2 , act_type = "tanh" )
pool2 = mx . sym . Pooling ( data = tanh2 , pool_type = "max" , kernel = ( 2 , 2 ), stride = ( 2 , 2 ))
# first fullc layer: flatten, 500 hidden units, tanh
flatten = mx . sym . Flatten ( data = pool2 )
fc1 = mx . symbol . FullyConnected ( data = flatten , num_hidden = 500 )
tanh3 = mx . sym . Activation ( data = fc1 , act_type = "tanh" )
# second fullc: 10 hidden units
fc2 = mx . sym . FullyConnected ( data = tanh3 , num_hidden = 10 )
# softmax loss
lenet = mx . sym . SoftmaxOutput ( data = fc2 , name = 'softmax' )
# Output may vary
# Model wrapper around the `lenet` symbol: 10 epochs, learning rate 0.1
model = mx . model . FeedForward (
ctx = mx . gpu ( 0 ), # use GPU 0 for training, others are same as before
symbol = lenet ,
num_epoch = 10 ,
learning_rate = 0.1 )
# Train on train_iter, evaluating on val_iter; the Speedometer callback is
# constructed with the batch size and 200
model . fit (
X = train_iter ,
eval_data = val_iter ,
batch_end_callback = mx . callback . Speedometer ( batch_size , 200 )
)
# Fail unless the validation score exceeds 0.98
assert model . score ( val_iter ) > 0.98 , "Low validation accuracy."

When building a simple CNN for MNIST in raw `Theano`

, one will typically build some helper functions
to make the process easier.

The following is from a Theano Tutorial repo by Alec Radford :

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
from load import mnist
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
# Shared random stream; dropout() draws its binomial masks from it
srng = RandomStreams ()
def floatX(X):
    """Return X as a NumPy array whose dtype is theano.config.floatX."""
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)
def init_weights(shape):
    """Create a Theano shared variable of the given shape.

    Initial values are standard-normal samples scaled by 0.01.
    """
    initial_values = np.random.randn(*shape) * 0.01
    return theano.shared(floatX(initial_values))
def rectify(X):
    """Rectified linear unit: element-wise maximum of X and zero."""
    floor = 0.
    return T.maximum(X, floor)
def softmax(X):
    """Softmax along axis 1 of X.

    The maximum along axis 1 is subtracted before exponentiating; the
    exponentials are then divided by their sum along the same axis.
    """
    row_max = X.max(axis=1).dimshuffle(0, 'x')
    e_x = T.exp(X - row_max)
    row_total = e_x.sum(axis=1).dimshuffle(0, 'x')
    return e_x / row_total
def dropout(X, p=0.):
    """Apply dropout with drop probability `p` to X.

    When p is not positive, X is returned unchanged. Otherwise X is
    multiplied by a binomial mask drawn from `srng` with success probability
    1 - p, and the product is divided by 1 - p.
    """
    if not p > 0:
        return X
    retain_prob = 1 - p
    mask = srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
    return X * mask / retain_prob
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    """Build the list of RMSprop update pairs for `params`.

    cost    -- expression to differentiate
    params  -- shared variables to update
    lr      -- step size
    rho     -- decay factor of the running average of squared gradients
    epsilon -- constant added under the square root

    Returns a list of (shared_variable, new_value) pairs: for each parameter,
    one pair for its accumulator followed by one for the parameter itself.
    """
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for param, grad in zip(params, grads):
        # Accumulator with the parameter's shape, starting at zero.
        acc = theano.shared(param.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * grad ** 2
        scaled_grad = grad / T.sqrt(acc_new + epsilon)
        updates.extend([(acc, acc_new), (param, param - lr * scaled_grad)])
    return updates
def model(X, w, w2, w3, w4, p_drop_conv, p_drop_hidden):
    """Build the network graph: three conv/pool stages and two dense layers.

    X             -- input the first convolution is applied to
    w, w2, w3     -- filters of the three convolutions
    w4            -- weights of the hidden dense layer
    p_drop_conv   -- dropout probability used after each of the three stages
    p_drop_hidden -- dropout probability used after the hidden dense layer

    Returns (l1, l2, l3, l4, pyx): the four intermediate layer outputs and
    the softmax output.

    NOTE: the output weights `w_o` are not a parameter; they are read from
    module scope.
    """

    def conv_pool(inputs, filters, **conv_kwargs):
        # convolution -> rectifier -> 2x2 max pooling
        return max_pool_2d(rectify(conv2d(inputs, filters, **conv_kwargs)), (2, 2))

    # Only the first convolution uses border_mode='full'.
    l1 = dropout(conv_pool(X, w, border_mode='full'), p_drop_conv)
    l2 = dropout(conv_pool(l1, w2), p_drop_conv)
    # The third stage is flattened to 2 dimensions before dropout.
    l3 = dropout(T.flatten(conv_pool(l2, w3), outdim=2), p_drop_conv)
    l4 = dropout(rectify(T.dot(l3, w4)), p_drop_hidden)
    pyx = softmax(T.dot(l4, w_o))
    return l1, l2, l3, l4, pyx
# Load MNIST with one-hot labels and reshape the images to (N, 1, 28, 28)
trX , teX , trY , teY = mnist ( onehot = True )
trX = trX . reshape ( - 1 , 1 , 28 , 28 )
teX = teX . reshape ( - 1 , 1 , 28 , 28 )
# Symbolic inputs: a float 4-D tensor for the images, a float matrix for the labels
X = T . ftensor4 ()
Y = T . fmatrix ()
# Weights: three banks of 3x3 filters (32, 64, 128), a 128*3*3 -> 625 dense
# layer and the 625 -> 10 output layer
w = init_weights (( 32 , 1 , 3 , 3 ))
w2 = init_weights (( 64 , 32 , 3 , 3 ))
w3 = init_weights (( 128 , 64 , 3 , 3 ))
w4 = init_weights (( 128 * 3 * 3 , 625 ))
w_o = init_weights (( 625 , 10 ))
# Two graphs over the same weights: one with dropout (0.2 / 0.5), used for
# the training cost, and one without dropout, used for prediction
noise_l1 , noise_l2 , noise_l3 , noise_l4 , noise_py_x = model ( X , w , w2 , w3 , w4 , 0.2 , 0.5 )
l1 , l2 , l3 , l4 , py_x = model ( X , w , w2 , w3 , w4 , 0. , 0. )
# Predicted class: arg-max of the dropout-free output
y_x = T . argmax ( py_x , axis = 1 )
# Training cost: mean categorical cross-entropy of the dropout output
cost = T . mean ( T . nnet . categorical_crossentropy ( noise_py_x , Y ))
params = [ w , w2 , w3 , w4 , w_o ]
updates = RMSprop ( cost , params , lr = 0.001 )
# Compiled functions: `train` returns the cost and applies the updates,
# `predict` returns the predicted classes
train = theano . function ( inputs = [ X , Y ], outputs = cost , updates = updates , allow_input_downcast = True )
predict = theano . function ( inputs = [ X ], outputs = y_x , allow_input_downcast = True )
# Train for 100 epochs in mini-batches of 128, printing test accuracy after
# each epoch.
for i in range(100):
    # Step through every start offset. The previous zip of two ranges stopped
    # one batch early and so never trained on the tail of the data; slicing
    # clamps `end`, so a shorter final batch is handled too.
    for start in range(0, len(trX), 128):
        end = start + 128
        # NOTE: this rebinds the module-level name `cost`, as the original did.
        cost = train(trX[start:end], trY[start:end])
    # Call form of print, valid on both Python 2 and Python 3.
    print(np.mean(np.argmax(teY, axis=1) == predict(teX)))

When building a simple CNN for MNIST in raw `TensorFlow`

, one will typically build some helper functions
to make the process easier.

The following is from ddigiorg’s repo :

# Interactive session; the .eval() / .run() calls further down supply no
# explicit session
sess = tf . InteractiveSession ()
# Placeholders: 784 values per image row, 10 values per label row
x = tf . placeholder ( tf . float32 , shape = [ None , 784 ])
y_ = tf . placeholder ( tf . float32 , shape = [ None , 10 ])
# Helper functions
def weight_variable(shape):
    """Return a tf.Variable of `shape` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a tf.Variable of `shape` with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Convolve `x` with the filters `W` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool_2x2(x):
    """Max-pool `x` over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
# Reshape the flat 784-value rows into 28x28 images with one channel
x_image = tf.reshape(x, [-1, 28, 28, 1])

# First convolutional and max-pool layers: 5x5 filters, 1 -> 32 channels.
# The convolution must read the reshaped `x_image`: the flat placeholder `x`
# is 2-D, while the convolution needs the 4-D image tensor. `x_image` was
# previously computed but never used.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# Second convolutional and max-pool layers: 5x5 filters, 32 -> 64 channels
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# Densely connected layer: flatten to 7*7*64 values, then 1024 ReLU units
W_fcl = weight_variable([7 * 7 * 64, 1024])
b_fcl = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fcl = tf.nn.relu(tf.matmul(h_pool2_flat, W_fcl) + b_fcl)

# Dropout; the keep probability is fed at run time
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fcl, keep_prob)

# Output layer (softmax) over the 10 classes
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
# Train and evaluate the model
# Loss: negated sum of label * log(prediction) over all entries
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

# Accuracy: fraction of rows whose predicted arg-max equals the label arg-max
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

sess.run(tf.initialize_all_variables())

for i in range(20000):
    batch = mnist.train.next_batch(50)

    # Every 100 steps, report accuracy on the current batch with dropout off
    if i % 100 == 0:
        eval_feed = {x: batch[0], y_: batch[1], keep_prob: 1.0}
        train_accuracy = accuracy.eval(feed_dict=eval_feed)
        print("step %d, training accuracy %g" % (i, train_accuracy))

    # One optimisation step with keep probability 0.5
    train_feed = {x: batch[0], y_: batch[1], keep_prob: 0.5}
    train_step.run(feed_dict=train_feed)

# Final accuracy on the test images, with dropout off
test_feed = {x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}
print("test accuracy %g" % accuracy.eval(feed_dict=test_feed))