DonaldRauscher.com

A Blog About D4T4 & M47H

Building and Deploying a Deep Learning Model Part 2: Building the Custom Estimator

09 September ’18

This is part 2 in a 3-part series (part 1, part 3) on building and deploying a deep learning model for the popular ACL 2011 IMDB dataset. In this part, I build a custom estimator in TensorFlow.


A few details on the model itself:

  • I used cosine annealing to reduce the learning rate over the course of training (sketched below)
  • I used dropout to counteract overfitting and batch normalization before activation layers
  • I used leaky ReLU rather than regular ReLU to mitigate the "dying ReLU" problem, where neurons get stuck outputting zero because the gradient is zero for negative inputs
  • I leveraged transfer learning, using GloVe vectors to initialize my word embeddings
  • Rather than using a bag-of-words representation, which ignores word order, I used a 1D convolution layer to model the interaction between words and their neighbors
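
Since the activation and learning-rate schedule are easy to get backwards, here is a minimal NumPy sketch (not from the original notebook) of leaky ReLU and of the cosine-annealed learning rate that tf.train.cosine_decay implements, using the same alpha=0.05 and 12,000-step decay as the training code below:

import numpy as np

# leaky ReLU: a small negative slope keeps gradients flowing when x < 0
def leaky_relu(x, alpha=0.2):
    return np.where(x > 0, x, alpha * x)

# cosine annealing: decay from lr_start down to alpha * lr_start over decay_steps
def cosine_lr(step, lr_start=0.1, decay_steps=12000, alpha=0.05):
    cosine = 0.5 * (1 + np.cos(np.pi * min(step, decay_steps) / decay_steps))
    return lr_start * ((1 - alpha) * cosine + alpha)

print(leaky_relu(np.array([-2.0, 0.0, 2.0])))               # [-0.4  0.   2. ]
print([round(cosine_lr(s), 4) for s in (0, 6000, 12000)])   # [0.1, 0.0525, 0.005]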


Initialize word embeddings with GloVe

import math

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.stats import truncnorm

# get vocabulary from the tf.Transform output (tft_output) created in part 1
vocab = tft_output.vocabulary_by_name('vocab')
vocab_size = len(vocab)
# load glove embeddings
embedding_size = 200
glove_embeddings = {}

with open('glove/glove.twitter.27B.{}d.txt'.format(embedding_size), mode='r') as f:  
    for line in f:
        values = line.strip().split()
        w = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        glove_embeddings[w] = vectors
# create embedding matrix, initialized with a truncated normal; rows for terms
# found in GloVe are overwritten below (the extra row handles the padding index)
embedding_matrix = truncnorm.rvs(a=-2, b=2, size=(vocab_size+1, embedding_size))

# standardize the GloVe vectors so they sit on the same scale as the truncated normal
glove_np = pd.DataFrame(glove_embeddings).values
glove_mu, glove_std = np.mean(glove_np), np.std(glove_np)

for i, w in enumerate(vocab):
    try:
        embedding_matrix[i] = np.clip((glove_embeddings[w] - glove_mu)/glove_std, -2, 2)
    except KeyError:
        pass

# shrink the initialization so initial embedding norms are small
embedding_matrix = embedding_matrix / math.sqrt(embedding_size)

def embedding_initializer(shape=None, dtype=tf.float32, partition_info=None):  
    assert dtype is tf.float32
    return embedding_matrix
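
A quick, optional check (not in the original post) is to see what fraction of the vocabulary actually receives a pretrained vector; terms without a GloVe match keep their truncated-normal initialization:

# fraction of vocabulary terms covered by the pretrained GloVe vectors
n_covered = sum(1 for w in vocab if w in glove_embeddings)
print('GloVe coverage: {}/{} terms ({:.1%})'.format(n_covered, vocab_size, n_covered / vocab_size))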

Build classifier

import functools
import glob
import multiprocessing

# input function for reading the transformed TFRecords
# (feature_spec, defined in part 1, describes the transformed Example format)
def input_fn(input_file_pattern, num_epochs=None, batch_size=25, shuffle=True, prefetch=1):
    input_file_names = glob.glob(input_file_pattern)

    ds = tf.data.TFRecordDataset(input_file_names)
    ds = ds.cache()

    if shuffle:
        ds = ds.apply(tf.contrib.data.shuffle_and_repeat(buffer_size=1000, count=num_epochs))
    else:
        ds = ds.repeat(num_epochs)

    ds = ds.apply(tf.contrib.data.map_and_batch(
        map_func=lambda x: tf.parse_single_example(x, feature_spec), 
        batch_size=batch_size,
        num_parallel_calls=multiprocessing.cpu_count()
    ))

    if prefetch > 0:
        ds = ds.prefetch(prefetch)

    features = ds.make_one_shot_iterator().get_next()
    labels = features.pop('label')
    return features, labels

train_input_fn = functools.partial(input_fn,
                                   input_file_pattern=wildcard(TRAIN_TRANSFORMED_PATH),
                                   num_epochs=1)

test_input_fn = functools.partial(input_fn,
                                  input_file_pattern=wildcard(TEST_TRANSFORMED_PATH),
                                  num_epochs=1)
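
As a quick sanity check (not in the original post), you can pull one batch through the pipeline in a session and eyeball the output, assuming the transformed TFRecords from part 1 are in place:

# pull one batch to verify the pipeline produces what the model expects
with tf.Graph().as_default():
    features, labels = train_input_fn(batch_size=5)
    with tf.Session() as sess:
        batch_features, batch_labels = sess.run([features, labels])
        print(batch_features['terms_indices'].dense_shape)  # sparse term indices
        print(batch_labels)                                  # 0/1 sentiment labels
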
# create estimator spec
def make_model(features, labels, mode, params, config):

    # hyperparameters
    dropout = params['dropout']
    conv_filters = params['conv_filters']
    dense_units = params['dense_units']
    learning_rate_start = params['learning_rate_start']
    learning_rate_steps = params['learning_rate_steps']

    # flag if training
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # set up input features
    terms = features['terms_indices']

    # pad the dense width to at least 3 so every sequence is long enough for the conv1d kernel (kernel_size=3)
    terms_shape = terms.dense_shape
    terms_shape = tf.stack([terms_shape[0], tf.where(terms_shape[1] < 3, tf.constant(3, dtype=tf.int64), terms_shape[1])], axis=0)

    # densify the sparse term indices, filling empty positions with the padding index (vocab_size),
    # then look up embeddings initialized from GloVe
    terms = tf.sparse_to_dense(terms.indices, terms_shape, terms.values, default_value=vocab_size)
    terms_embed_seq = tf.contrib.layers.embed_sequence(terms, vocab_size=vocab_size+1, embed_dim=embedding_size, initializer=embedding_initializer)

    # build graph
    net = terms_embed_seq
    net = tf.layers.dropout(net, rate=dropout, training=is_training)
    net = tf.layers.conv1d(inputs=net, filters=conv_filters, kernel_size=3, strides=1, activation=tf.nn.leaky_relu)
    net = tf.reduce_max(input_tensor=net, axis=1)  # global max pooling over the sequence dimension
    net = tf.layers.dropout(net, rate=dropout, training=is_training)
    net = tf.layers.batch_normalization(net, training=is_training)
    net = tf.layers.dense(net, units=dense_units, activation=tf.nn.leaky_relu)
    logits = tf.layers.dense(net, 2)

    # compute predictions
    predicted_classes = tf.argmax(logits, 1)
    predicted_probs = tf.nn.softmax(logits)

    # generate predictions
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': predicted_classes,
            'prob': predicted_probs
        }

        export_outputs = {
          'predict': tf.estimator.export.PredictOutput(outputs=predictions)
        }

        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    # compute loss
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # create training op with cosine annealing for learning rate
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()

        learning_rate = tf.train.cosine_decay(learning_rate=learning_rate_start, global_step=global_step, 
                                              alpha=0.05, decay_steps=learning_rate_steps)

        optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
        optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, 5.0)

        # ensure batch normalization statistics are updated during training
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # compute evaluation metrics
    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes),
        'auc': tf.metrics.auc(labels=labels, predictions=predicted_probs[:, 1])
    }
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)

Train classifier

# build classifier
!rm -Rf $MODEL_LOG

epoch_size = 25000
num_epochs = 5
batch_size = 10

# total training steps, rounded down to the nearest 1000: 25000 * 5 / 10 = 12500 -> 12000
num_steps = epoch_size * num_epochs / batch_size // 1000 * 1000

params = dict(
    dropout=0.2,
    conv_filters=500,
    dense_units=100,
    learning_rate_start=0.1,
    learning_rate_steps=num_steps
)

# keep one checkpoint per epoch so the best-performing one can be evaluated and exported later
ckpt_config = tf.estimator.RunConfig(keep_checkpoint_max=num_epochs)

classifier = tf.estimator.Estimator(model_fn=make_model,
                                    params=params,
                                    model_dir=MODEL_LOG,
                                    config=ckpt_config)
# train classifier
train_stats = []
for i in range(num_epochs):
    print("Starting epoch {}/{}...".format(i+1, num_epochs))
    # each call to train() makes one pass over the data (num_epochs=1 in the input fn)
    classifier.train(input_fn=lambda: train_input_fn(batch_size=batch_size))
    ckpt = classifier.latest_checkpoint()
    train_auc = classifier.evaluate(input_fn=lambda: train_input_fn())['auc']
    test_auc = classifier.evaluate(input_fn=lambda: test_input_fn())['auc']
    train_stats.append((ckpt, train_auc, test_auc))

train_stats = pd.DataFrame(train_stats, columns=['ckpt', 'train_auc', 'test_auc'])
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...

Evaluate classifier

# plot train stats
import matplotlib.pyplot as plt

ind = np.arange(len(train_stats)) + 1
width = 0.35

fig, ax = plt.subplots()
train_bar = ax.bar(ind - width/2, train_stats['train_auc'].round(4), width, color='SkyBlue', label='Train')
test_bar = ax.bar(ind + width/2, train_stats['test_auc'].round(4), width,  color='IndianRed', label='Test')

# adds labels to a bar chart series
def autolabel(ax, rects, xpos='center'):
    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
                '{}'.format(height), ha=ha[xpos], va='bottom')

autolabel(ax, train_bar, "center")
autolabel(ax, test_bar, "center")

ax.set_ylabel('AUC')
ax.set_xlabel('Epochs')
ax.set_xticks(ind)
ax.legend()
ax.set_ylim(0.8, 1.1)

plt.show()

[Figure: train vs. test AUC by training epoch]

# overall stats
best_ckpt = train_stats.sort_values(by=['test_auc'], ascending=False)['ckpt'].values[0]

train_stats = classifier.evaluate(input_fn=train_input_fn, checkpoint_path=best_ckpt)
test_stats = classifier.evaluate(input_fn=test_input_fn, checkpoint_path=best_ckpt)

train_stats = pd.DataFrame.from_dict(train_stats, orient='index', columns=['train'])
test_stats = pd.DataFrame.from_dict(test_stats, orient='index', columns=['test'])
stats = train_stats.join(test_stats)
stats
                    train          test
loss             0.088654      0.230451
auc              0.997005      0.969034
global_step  12500.000000  12500.000000
accuracy         0.973600      0.911200

Export

def serving_input_fn():
    review = tf.placeholder(dtype=tf.string)
    label = tf.zeros(dtype=tf.int64, shape=[1, 1])  # just a placeholder; not used at serving time

    # apply the same tf.Transform preprocessing used during training to the raw request
    transformed_features = tft_output.transform_raw_features({'review': review, 'label': label})

    return tf.estimator.export.ServingInputReceiver(transformed_features, {'review': review})


export_path = classifier.export_savedmodel(export_dir_base='exports',
                                           serving_input_receiver_fn=serving_input_fn,
                                           checkpoint_path=best_ckpt)

export_path = export_path.decode('utf-8')
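
Before moving on to serving in part 3, the export can be smoke-tested locally with tf.contrib.predictor (a sketch, not in the original post; it assumes the 'predict' signature declared in export_outputs and a batched string input for 'review'):

from tensorflow.contrib import predictor

# load the exported SavedModel and score a raw review string end-to-end
predict_fn = predictor.from_saved_model(export_path, signature_def_key='predict')
result = predict_fn({'review': ['This movie was an absolute delight!']})
print(result['class'], result['prob'])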

Link to all code: https://github.com/donaldrauscher/movie-reviews-tf