2. Neural Transfer

This matters because style transfer is a compact example of using pretrained networks for optimization over inputs rather than over weights. It helps clarify what feature representations capture and how losses can target perceptual structure.

2.1. Input images

[1]:

%matplotlib inline

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from PIL import Image
import matplotlib.pyplot as plt

import torchvision.transforms as transforms
import torchvision.models as models

import copy

np.random.seed(37)
torch.manual_seed(37)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_image_size():
    imsize = 512 if torch.cuda.is_available() else 128
    return imsize

def get_loader():
    image_size = get_image_size()
    loader = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor()])
    return loader

def get_unloader():
    unloader = transforms.ToPILImage()
    return unloader

def image_loader(image_name):
    device = get_device()
    image = Image.open(image_name)
    # fake batch dimension required to fit network's input dimensions
    loader = get_loader()
    image = loader(image).unsqueeze(0)
    return image.to(device, torch.float)

def imshow(tensor, title=None):
    image = tensor.cpu().clone()  # we clone the tensor to not do changes on it
    image = image.squeeze(0)      # remove the fake batch dimension
    unloader = get_unloader()
    image = unloader(image)
    plt.imshow(image)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)

style_img = image_loader("./styles/picasso-01.jpg")
content_img = image_loader("./styles/dancing.jpg")
input_img = content_img.clone()

assert style_img.size() == content_img.size(), \
    f'size mismatch, style {style_img.size()}, content {content_img.size()}'

plt.ion()

plt.figure()
imshow(input_img, title='Input Image')

plt.figure()
imshow(style_img, title='Style Image')

plt.figure()
imshow(content_img, title='Content Image')

2.2. Loss functions

2.2.1. Content loss

[2]:

class ContentLoss(nn.Module):

    def __init__(self, target,):
        super(ContentLoss, self).__init__()
        # we 'detach' the target content from the tree used
        # to dynamically compute the gradient: this is a stated value,
        # not a variable. Otherwise the forward method of the criterion
        # will throw an error.
        self.target = target.detach()

    def forward(self, input):
        self.loss = F.mse_loss(input, self.target)
        return input

2.2.2. Style loss

[3]:

def gram_matrix(input):
    a, b, c, d = input.size()  # a=batch size(=1)
    # b=number of feature maps
    # (c,d)=dimensions of a f. map (N=c*d)

    features = input.view(a * b, c * d)  # resise F_XL into \hat F_XL

    G = torch.mm(features, features.t())  # compute the gram product

    # we 'normalize' the values of the gram matrix
    # by dividing by the number of element in each feature maps.
    return G.div(a * b * c * d)

class StyleLoss(nn.Module):

    def __init__(self, target_feature):
        super(StyleLoss, self).__init__()
        self.target = gram_matrix(target_feature).detach()

    def forward(self, input):
        G = gram_matrix(input)
        self.loss = F.mse_loss(G, self.target)
        return input

2.3. Model

[4]:

device = get_device()

cnn = models.vgg19(pretrained=True).features.to(device).eval()

2.4. Normalization

[5]:

class Normalization(nn.Module):
    def __init__(self, mean, std):
        super(Normalization, self).__init__()
        # .view the mean and std to make them [C x 1 x 1] so that they can
        # directly work with image Tensor of shape [B x C x H x W].
        # B is batch size. C is number of channels. H is height and W is width.
        self.mean = torch.tensor(mean).view(-1, 1, 1)
        self.std = torch.tensor(std).view(-1, 1, 1)

    def forward(self, img):
        # normalize img
        return (img - self.mean) / self.std

cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(device)
cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(device)

2.5. Loss

[6]:

content_layers_default = ['conv_4']
style_layers_default = ['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5']

def get_style_model_and_losses(cnn, normalization_mean, normalization_std,
                               style_img, content_img,
                               content_layers=content_layers_default,
                               style_layers=style_layers_default):
    cnn = copy.deepcopy(cnn)

    # normalization module
    normalization = Normalization(normalization_mean, normalization_std).to(device)

    # just in order to have an iterable access to or list of content/syle
    # losses
    content_losses = []
    style_losses = []

    # assuming that cnn is a nn.Sequential, so we make a new nn.Sequential
    # to put in modules that are supposed to be activated sequentially
    model = nn.Sequential(normalization)

    i = 0  # increment every time we see a conv
    for layer in cnn.children():
        if isinstance(layer, nn.Conv2d):
            i += 1
            name = 'conv_{}'.format(i)
        elif isinstance(layer, nn.ReLU):
            name = 'relu_{}'.format(i)
            # The in-place version doesn't play very nicely with the ContentLoss
            # and StyleLoss we insert below. So we replace with out-of-place
            # ones here.
            layer = nn.ReLU(inplace=False)
        elif isinstance(layer, nn.MaxPool2d):
            name = 'pool_{}'.format(i)
        elif isinstance(layer, nn.BatchNorm2d):
            name = 'bn_{}'.format(i)
        else:
            raise RuntimeError('Unrecognized layer: {}'.format(layer.__class__.__name__))

        model.add_module(name, layer)

        if name in content_layers:
            # add content loss:
            target = model(content_img).detach()
            content_loss = ContentLoss(target)
            model.add_module("content_loss_{}".format(i), content_loss)
            content_losses.append(content_loss)

        if name in style_layers:
            # add style loss:
            target_feature = model(style_img).detach()
            style_loss = StyleLoss(target_feature)
            model.add_module("style_loss_{}".format(i), style_loss)
            style_losses.append(style_loss)

    # now we trim off the layers after the last content and style losses
    for i in range(len(model) - 1, -1, -1):
        if isinstance(model[i], ContentLoss) or isinstance(model[i], StyleLoss):
            break

    model = model[:(i + 1)]

    return model, style_losses, content_losses

2.6. Optimizer

[7]:

def get_input_optimizer(input_img):
    # this line to show that input is a parameter that requires a gradient
    optimizer = optim.LBFGS([input_img.requires_grad_()])
    return optimizer

2.7. Transfer

[8]:

import warnings
from collections import namedtuple

RESULTS = namedtuple('RESULTS', 'run style content')
results = []

def run_style_transfer(cnn, normalization_mean, normalization_std,
                       content_img, style_img, input_img, num_steps=600,
                       style_weight=1000000, content_weight=1):
    model, style_losses, content_losses = get_style_model_and_losses(cnn,
        normalization_mean, normalization_std, style_img, content_img)
    optimizer = get_input_optimizer(input_img)

    run = [0]
    while run[0] <= num_steps:

        def closure():
            # correct the values of updated input image
            input_img.data.clamp_(0, 1)

            optimizer.zero_grad()
            model(input_img)
            style_score = 0
            content_score = 0

            for sl in style_losses:
                style_score += sl.loss
            for cl in content_losses:
                content_score += cl.loss

            style_score *= style_weight
            content_score *= content_weight

            loss = style_score + content_score
            loss.backward()

            run[0] += 1
            results.append(RESULTS(run[0], style_score.item(), content_score.item()))

            if run[0] % 10 == 0:
                s_score = style_score.item()
                c_score = content_score.item()

                print(f'[{run[0]}/{num_steps}] Style Loss {s_score:.4f}, Content Loss {c_score}')
            return style_score + content_score

        optimizer.step(closure)

    # a last correction...
    input_img.data.clamp_(0, 1)

    return input_img

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    output = run_style_transfer(cnn, cnn_normalization_mean, cnn_normalization_std,
                            content_img, style_img, input_img)

[10/600] Style Loss 5683.9438, Content Loss 28.085737228393555
[20/600] Style Loss 1513.7101, Content Loss 32.92472839355469
[30/600] Style Loss 839.8792, Content Loss 33.72367477416992
[40/600] Style Loss 495.5242, Content Loss 33.512489318847656
[50/600] Style Loss 356.6638, Content Loss 33.42991256713867
[60/600] Style Loss 269.5034, Content Loss 33.4761962890625
[70/600] Style Loss 216.9316, Content Loss 33.47385787963867
[80/600] Style Loss 163.2883, Content Loss 33.37232208251953
[90/600] Style Loss 135.9556, Content Loss 33.303672790527344
[100/600] Style Loss 109.1580, Content Loss 33.22367477416992
[110/600] Style Loss 91.8495, Content Loss 33.01668930053711
[120/600] Style Loss 79.5773, Content Loss 32.93888854980469
[130/600] Style Loss 68.3634, Content Loss 32.801029205322266
[140/600] Style Loss 57.8147, Content Loss 32.63578796386719
[150/600] Style Loss 49.9333, Content Loss 32.42307662963867
[160/600] Style Loss 44.3781, Content Loss 32.22199630737305
[170/600] Style Loss 38.0064, Content Loss 32.04217529296875
[180/600] Style Loss 34.1079, Content Loss 31.90805435180664
[190/600] Style Loss 37.2641, Content Loss 31.57277488708496
[200/600] Style Loss 27.3096, Content Loss 31.456771850585938
[210/600] Style Loss 24.5254, Content Loss 31.218570709228516
[220/600] Style Loss 22.1505, Content Loss 30.969505310058594
[230/600] Style Loss 20.1192, Content Loss 30.759197235107422
[240/600] Style Loss 18.0730, Content Loss 30.493648529052734
[250/600] Style Loss 16.4035, Content Loss 30.251787185668945
[260/600] Style Loss 15.0762, Content Loss 29.994197845458984
[270/600] Style Loss 13.7020, Content Loss 29.754240036010742
[280/600] Style Loss 12.5971, Content Loss 29.49911117553711
[290/600] Style Loss 11.6450, Content Loss 29.278596878051758
[300/600] Style Loss 10.8885, Content Loss 29.027385711669922
[310/600] Style Loss 10.3600, Content Loss 28.788755416870117
[320/600] Style Loss 9.4420, Content Loss 28.623498916625977
[330/600] Style Loss 8.8160, Content Loss 28.381839752197266
[340/600] Style Loss 8.2037, Content Loss 28.1502628326416
[350/600] Style Loss 7.7651, Content Loss 27.919567108154297
[360/600] Style Loss 7.2097, Content Loss 27.767620086669922
[370/600] Style Loss 6.7861, Content Loss 27.577590942382812
[380/600] Style Loss 6.4192, Content Loss 27.3933162689209
[390/600] Style Loss 6.0287, Content Loss 27.208782196044922
[400/600] Style Loss 5.7178, Content Loss 27.02840805053711
[410/600] Style Loss 5.4190, Content Loss 26.81221580505371
[420/600] Style Loss 5.0295, Content Loss 26.66666030883789
[430/600] Style Loss 4.7181, Content Loss 26.474632263183594
[440/600] Style Loss 4.4720, Content Loss 26.35723876953125
[450/600] Style Loss 4.2306, Content Loss 26.198747634887695
[460/600] Style Loss 4.0172, Content Loss 26.03799819946289
[470/600] Style Loss 3.7802, Content Loss 25.899450302124023
[480/600] Style Loss 4.3728, Content Loss 25.672344207763672
[490/600] Style Loss 3.3955, Content Loss 25.647024154663086
[500/600] Style Loss 3.2563, Content Loss 25.517745971679688
[510/600] Style Loss 3.1030, Content Loss 25.393512725830078
[520/600] Style Loss 3.0081, Content Loss 25.26365852355957
[530/600] Style Loss 2.8389, Content Loss 25.16840362548828
[540/600] Style Loss 2.7011, Content Loss 25.061481475830078
[550/600] Style Loss 2.5967, Content Loss 24.921926498413086
[560/600] Style Loss 2.4828, Content Loss 24.828998565673828
[570/600] Style Loss 2.3790, Content Loss 24.728757858276367
[580/600] Style Loss 2.2858, Content Loss 24.615867614746094
[590/600] Style Loss 2.2063, Content Loss 24.523954391479492
[600/600] Style Loss 2.1230, Content Loss 24.429014205932617
[610/600] Style Loss 2.0673, Content Loss 24.335756301879883
[620/600] Style Loss 1.9885, Content Loss 24.26019287109375

2.8. Results

[9]:

x = [r.run for r in results]
y1 = [r.style for r in results]
y2 = [r.content for r in results]

fig, ax1 = plt.subplots(figsize=(10, 5))

color = 'tab:red'
ax1.plot(x, y1, color=color)
ax1.set_ylabel('Style Loss', color=color)
ax1.tick_params(axis='y', labelcolor=color)

color = 'tab:blue'
ax2 = ax1.twinx()
ax2.plot(x, y2, color=color)
ax2.set_ylabel('Content Loss', color=color)
ax2.tick_params(axis='y', labelcolor=color)

2.9. Visualize

[10]:

plt.figure()
imshow(output, title='Output Image')

# sphinx_gallery_thumbnail_number = 4
plt.ioff()
plt.show()