# PyTorch
Gonzalo Rios (grios@dim.uchile.cl)

1. Wiki: https://en.wikipedia.org/wiki/PyTorch
2. Github: https://github.com/pytorch/pytorch
3. Docs: https://pytorch.org/
4. Cuda: https://developer.nvidia.com/cuda-downloads

# Tensors
https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html

In [None]:
import torch

In [None]:
x = torch.empty(5, 3)
x

In [None]:
x = torch.rand(5, 3)
x

In [None]:
x.size()

In [None]:
x = torch.zeros(5, 3)
x, x.dtype

In [None]:
torch.get_default_dtype()

In [None]:
torch.set_default_dtype(torch.float64)
x = torch.zeros(5, 3)
x, x.dtype

In [None]:
# Restore float32 as the default floating-point dtype.
# torch.set_default_tensor_type is deprecated; set_default_dtype is the
# supported way to change the default dtype.
torch.set_default_dtype(torch.float32)

In [None]:
x = torch.zeros(5, 3, dtype=torch.long)
x, x.dtype

In [None]:
x = x.new_ones(5, 3)
x, x.dtype

In [None]:
x = torch.tensor([5.5, 3])
x, x.dtype

In [None]:
x = x.new_ones(5, 3)
x, x.dtype

In [None]:
x = x.new_ones(5, 3, dtype=torch.double) # new_* methods take in sizes
x

In [None]:
y = torch.rand(5, 3)
y

In [None]:
x + y

In [None]:
# Make a float32 copy of x so it can be added to y.
# Passing an existing tensor to torch.tensor() emits a UserWarning;
# detach().clone() is the recommended way to copy a tensor.
x = x.detach().clone().to(torch.float32)
x + y

In [None]:
torch.add(x,y)

In [None]:
result = torch.empty(5, 3)
result = torch.add(x, y)
result

In [None]:
result = torch.empty(5, 3)
torch.add(x, y, out=result)
result

In [None]:
y

In [None]:
y.add(x)

In [None]:
# Any operation that mutates a tensor in-place is post-fixed with an _. 
# For example: x.copy_(y), x.t_(), will change x.
y.add_(x)
y

In [None]:
x.t()

In [None]:
x

In [None]:
x.t_()

In [None]:
x

In [None]:
y[1:]

In [None]:
y[1:2]

In [None]:
y[1:,2:]

In [None]:
indices = torch.tensor([0, 2])
torch.index_select(y, 0, indices)

In [None]:
torch.index_select(y, 1, indices)

In [None]:
# Resizing: If you want to resize/reshape tensor, you can use torch.view:
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8) # the size -1 is inferred from other dimensions
print(x.size(), y.size(), z.size())

In [None]:
x, y, z

In [None]:
x[0,0]+=1
x, y, z

In [None]:
z[0,0]+=1

In [None]:
x

In [None]:
x = torch.randn(1)
x, x.item(), type(x), type(x.item())

In [None]:
a = torch.ones(5)
b = a.numpy()
a, type(a), b, type(b)

In [None]:
a.add_(1)
a, b

In [None]:
import numpy as np
a = np.ones(5, dtype=np.float32)
b = torch.from_numpy(a)
a, b

In [None]:
np.add(a, 1, out=a)
a, b

In [None]:
torch.cat((b, b, b), dim=0)

In [None]:
torch.arange(10)

In [None]:
torch.linspace(0,10,10)

In [None]:
torch.eye(10)

In [None]:
torch.full((5, 5), 1.23)

# Autograd
https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

Tensors that track history
In autograd, if any input Tensor of an operation has requires_grad=True, the computation will be tracked. After computing the backward pass, a gradient w.r.t. this tensor is accumulated into .grad attribute.

There’s one more class which is very important for the autograd implementation: a Function. Tensor and Function are interconnected and build up an acyclic graph that encodes a complete history of computation. Each tensor has a .grad_fn attribute that references the Function that created it (except for Tensors created by the user, whose .grad_fn is None).

In [None]:
import torch

In [None]:
x = torch.ones(2, 2, requires_grad=True)
x

In [None]:
x.requires_grad

In [None]:
y = x + 2
y

In [None]:
y.requires_grad

In [None]:
y.grad_fn

In [None]:
z = y * y * 3
out = z.mean()

print(z, out)

In [None]:
z.grad_fn

In [None]:
out.grad_fn

In [None]:
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

In [None]:
a.requires_grad_(True)
b = (a * a).sum()
print(a.requires_grad)
print(b.grad_fn)

If you want to compute the derivatives, you can call .backward() on a Tensor. If the Tensor is a scalar (i.e. it holds a single element), you don’t need to pass any arguments to backward(); however, if it has more elements, you need to pass a gradient argument that is a tensor of matching shape.

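You should get a matrix filled with 4.5. Let’s call the out Tensor “o”. We have that $o = \frac{1}{4}\sum_i z_i$, $z_i = 3(x_i+2)^2$ and $z_i\big|_{x_i=1} = 27$. Therefore, $\frac{\partial o}{\partial x_i} = \frac{3}{2}(x_i+2)$, hence $\frac{\partial o}{\partial x_i}\big|_{x_i=1} = \frac{9}{2} = 4.5$.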

In [None]:
out

In [None]:
out.backward()

In [None]:
print(x.grad)

In [None]:
print(x.requires_grad)
print((x ** 2).requires_grad)

with torch.no_grad():
 print((x ** 2).requires_grad)

In [None]:
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3

z2 = y.mean()
out = z.mean()
out.backward()
#z2.backward()

In [None]:
x.grad

In [None]:
out.backward()

# Cuda

In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.device_count()

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
y = torch.ones_like(x, device=device)
y

In [None]:
x

In [None]:
x = x.to(device)
x

In [None]:
z = x + y
z

In [None]:
z.numpy()

In [None]:
z.detach().numpy()

In [None]:
z.cpu()

In [None]:
z = z.to('cpu', torch.double)
z

In [None]:
z.numpy()

In [None]:
z.detach().numpy()

# Testing PyTorch

In [None]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
nsize = 2
N, D_in, H, D_out = nsize*64, nsize*1000, nsize*100, nsize*1000
epoch = 5000
learning_rate = 1e-6

## Numpy

In [None]:
import numpy as np

# Synthetic regression problem: random inputs and random targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weights of the two-layer network, drawn from a standard normal.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)


for t in range(epoch):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss; uncomment the print to trace it per iteration.
    residual = y_pred - y
    loss = np.square(residual).sum()
    #print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. w2 and w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

## PyTorch

In [None]:
import torch

cuda = True
dtype = torch.float

device = torch.device("cuda") if torch.cuda.is_available() and cuda else torch.device("cpu")
device

In [None]:
# Synthetic regression problem: random inputs and random targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights of the two-layer network, drawn from a standard normal.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)


for t in range(epoch):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss as a Python float; uncomment the print to trace it.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    #print(t, loss)

    # Backward pass: hand-derived gradients of the loss w.r.t. w2 and w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

## Autograd

In [None]:
# Random inputs and targets for the regression problem.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are created with requires_grad=True so that autograd records the
# operations applied to them and can produce their gradients.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

for t in range(epoch):
    # Forward pass: linear -> ReLU -> linear. No intermediate values are kept
    # by hand because the backward pass is not written manually here.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss; sum() yields a zero-dimensional tensor and
    # loss.item() returns its value as a Python number.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    #print(t, loss.item())

    # Populate w1.grad and w2.grad with d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # The update itself must not be recorded by autograd, so it runs under
    # torch.no_grad(). torch.optim.SGD is the packaged equivalent of this step.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so clear it for the next pass.
        w1.grad.zero_()
        w2.grad.zero_()

## Custom Function

In [None]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and supplies static
    forward and backward methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return input with every negative entry replaced by zero.

        The input is stored on ctx with save_for_backward so that backward
        can tell which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output;
        it is passed through unchanged except where the saved input was
        negative, where the gradient is zero.
        """
        (saved_input,) = ctx.saved_tensors
        negative = saved_input < 0
        # masked_fill returns a new tensor, so grad_output is left untouched.
        return grad_output.masked_fill(negative, 0)

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its apply method; alias it as 'relu'.
# The alias does not change between iterations, so bind it once up front.
relu = MyReLU.apply

for t in range(epoch):
    # Forward pass: compute predicted y, using the custom autograd
    # operation for the ReLU.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and (optionally) print loss
    loss = (y_pred - y).pow(2).sum()
    #print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; the update must not be tracked.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Gradients accumulate across backward() calls, so zero them after
        # each update.
        w1.grad.zero_()
        w2.grad.zero_()

# Module
https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional network: two conv layers followed by three linear layers."""

    def __init__(self):
        super().__init__()
        # 1 input image channel -> 6 channels -> 16 channels, 5x5 kernels.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Affine layers (y = Wx + b): 16*5*5 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on x and return the output of the last linear layer."""
        # Convolution, ReLU, then max pooling over a (2, 2) window.
        pooled = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A single number is enough when the pooling window is square.
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), 2)

        # Flatten everything except the batch dimension.
        flat = pooled.view(-1, self.num_flat_features(pooled))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of x except the first (batch) one."""
        return x.size()[1:].numel()


net = Net()
print(net)

In [None]:
params = list(net.parameters())
print(len(params))

In [None]:
[k.size() for k in params]

In [None]:
params

In [None]:
net.fc1.bias

In [None]:
input_ = torch.randn(1, 1, 32, 32)
out = net(input_)
print(out)

In [None]:
net.zero_grad()
out.backward(torch.randn(1, 10))

In [None]:
net.fc1.bias.grad

In [None]:
output = net(input_)
output

In [None]:
target = torch.arange(1, 11, dtype=torch.float).view(1, -1) # a dummy target, make it the same shape as output
target

In [None]:
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

In [None]:
print(loss.grad_fn) # MSELoss
print(loss.grad_fn.next_functions[0][0]) # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # ReLU

In [None]:
net.zero_grad() # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

In [None]:
learning_rate = 0.01
# One plain SGD step: p <- p - lr * grad.
# The update runs under torch.no_grad() instead of going through .data, and
# parameters that have not received a gradient yet are skipped rather than
# raising on None.
with torch.no_grad():
    for f in net.parameters():
        if f.grad is not None:
            f.sub_(learning_rate * f.grad)

In [None]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
output = net(input_)
loss = criterion(output, target)
print(loss)
loss.backward()
optimizer.step() # Does the update

In [None]:
target

In [None]:
net(input_)

In [None]:
net(input_).detach().numpy()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(target.numpy().T, label='target')
plt.plot(net(input_).detach().numpy().T, label='prediction')
plt.legend()

In [None]:
# Run 100 optimisation steps on the single (input_, target) pair.
# A plain range is used for counting; iterating a tensor would create a
# zero-dimensional tensor per step only to convert it back with .item().
for i in range(100):
    optimizer.zero_grad()  # zero the gradient buffers
    output = net(input_)
    loss = criterion(output, target)
    print(i, '->', loss.item())
    loss.backward()
    optimizer.step()  # Does the update

In [None]:
plt.plot(target.numpy().T, label='target')
plt.plot(net(input_).detach().numpy().T, label='prediction')
plt.legend()

##### Part of the model on CPU and part on the GPU
https://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the example still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class DistributedModel(nn.Module):
    """Model whose embedding is created on the CPU and whose linear layer lives on `device`."""

    def __init__(self):
        # nn.Module.__init__ does not accept submodules as keyword arguments;
        # submodules are registered by assigning them as attributes.
        super().__init__()
        self.embedding = nn.Embedding(1000, 10)
        self.rnn = nn.Linear(10, 10).to(device)

    def forward(self, x):
        """Embed the indices in x, move the result to `device`, and apply the linear layer."""
        # Compute embedding on CPU
        x = self.embedding(x)

        # Transfer to `device`
        x = x.to(device)

        # Apply the layer stored on `device`
        x = self.rnn(x)
        return x

# Training a Classifier

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

In [None]:
transform = transforms.Compose(
 [transforms.ToTensor(),
 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
 download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
 shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
 download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
 shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    """Display a tensor image after undoing the (x - 0.5) / 0.5 scaling."""
    unnormalized = img / 2 + 0.5  # unnormalize
    # Move the first axis to the end before handing the array to matplotlib.
    channels_last = np.transpose(unnormalized.numpy(), (1, 2, 0))
    plt.imshow(channels_last)


# get some random training images
# DataLoader iterators follow the standard iterator protocol; use the builtin
# next() because the iterator's .next() method no longer exists.
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
# print one label per image in the batch, whatever the batch size is
print(' '.join('%5s' % classes[label] for label in labels))

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Convolutional classifier: two conv+pool stages, then three linear layers with 10 outputs."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the output of the last linear layer for the batch x."""
        # Each stage is convolution -> ReLU -> 2x2 max pooling.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Flatten to rows of 16 * 5 * 5 features for the linear layers.
        features = x.view(-1, 16 * 5 * 5)
        features = F.relu(self.fc1(features))
        features = F.relu(self.fc2(features))
        return self.fc3(features)


net = Net()

In [None]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# NOTE: `epoch` is reused as the loop variable here, so it is rebound.
for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0

    for i, data in enumerate(trainloader):
        # Each batch is an (inputs, labels) pair.
        inputs, labels = data

        # Gradients accumulate, so clear them before every step.
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        # Report the average loss once every 2000 mini-batches.
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

In [None]:
dataiter = iter(testloader)

In [None]:
# Fetch the next test batch with the builtin next(); DataLoader iterators no
# longer provide a .next() method.
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
# One entry per image in the batch, whatever the batch size is.
print('GroundTruth: ', ' '.join('%5s' % classes[label] for label in labels))

# The predicted class is the index of the largest output per image.
outputs = net(images)
_, predicted = torch.max(outputs, 1)
print('Predicted: ', ' '.join('%5s' % classes[label] for label in predicted))


In [None]:
# Count correct predictions over the whole test set.
correct = 0
total = 0
with torch.no_grad():  # evaluation only, no gradients needed
    for data in testloader:
        images, labels = data
        # The predicted class is the index of the largest output per image.
        _, predicted = torch.max(net(images), 1)
        total += labels.shape[0]
        correct += torch.eq(predicted, labels).sum().item()

accuracy_percent = 100 * correct / total
print('Accuracy of the network on the 10000 test images: %d %%' % (
    accuracy_percent))

In [None]:
# Per-class tallies of correct predictions and of samples seen.
class_correct = [0.0] * 10
class_total = [0.0] * 10
with torch.no_grad():  # evaluation only, no gradients needed
    for images, labels in testloader:
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        matches = predicted == labels
        # Walk the batch itself rather than assuming four items per batch, so
        # a short final batch or a batch of size 1 is handled as well.
        for label, hit in zip(labels.tolist(), matches.tolist()):
            class_correct[label] += hit
            class_total[label] += 1


for i in range(10):
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))

# Training a Module with Autograd on Cuda thanks to PyTorch

In [None]:
import torch
from torch import nn
import numpy as np
from matplotlib import pyplot as plt
from scipy.special import gamma
from scipy.linalg import toeplitz
from tqdm import tqdm
import pandas as pd
from IPython import display

cuda = True
device = torch.device("cuda") if torch.cuda.is_available() and cuda else torch.device("cpu")
device

In [None]:
plt.rcParams['figure.figsize'] = (20,8)

In [None]:
zero = torch.tensor([0], device=device)
one = torch.tensor([1.0], device=device)
two = torch.tensor([2.0], device=device)

## Define Model

In [None]:
class Linear(nn.Module):
    """Element-wise scaling layer with one learnable coefficient per (param, input) pair."""

    def __init__(self, ninputs=1, nparams=1):
        """
        Create the coefficient tensor `c` of shape (nparams, ninputs, 1).

        The coefficients are drawn uniformly from [0, 10).
        """
        super().__init__()
        self.ninputs = ninputs
        # torch.empty accepts any device; the legacy torch.Tensor(...)
        # constructor rejects non-CPU devices.
        self.c = nn.Parameter(torch.empty(nparams, ninputs, 1, device=device))

        # initialise
        nn.init.uniform_(self.c, 0, 10)

    def forward(self, x):
        """Return c * x, with c broadcast against x."""
        return torch.mul(self.c, x)

class AR1Weight(nn.Module):
    """Learnable geometric-decay weights, one decay rate per (param, input) pair."""

    def __init__(self, ninputs=1, nparams=1):
        """
        Create the `decay` parameter of shape (nparams, ninputs, 1, 1).

        The decay rates are drawn uniformly from [0, 1).
        """
        super().__init__()
        # shape = (nparams, ninputs, in_channels/groups, time)
        # torch.empty accepts any device; the legacy torch.Tensor(...)
        # constructor rejects non-CPU devices.
        self.decay = nn.Parameter(torch.empty(nparams, ninputs, 1, 1, device=device))
        # initialise
        nn.init.uniform_(self.decay, 0, 1)

    def forward(self, x):
        """
        Return the weights (1 - decay) * decay**k for k = T-1, ..., 0.

        T is x.shape[2]; only the shape of x is used. The result has shape
        (nparams, ninputs, 1, T) and is meant to be used as conv1d weights.
        """
        # x.shape = (nparams, ninputs, time)
        # Reversed time index, built on the parameter's own device and dtype
        # so the module keeps working after being moved with .to(...).
        steps = torch.arange(x.shape[2] - 1, -1, -1,
                             dtype=self.decay.dtype, device=self.decay.device)
        return torch.mul(1 - self.decay, torch.pow(self.decay, steps))

class ConvTime(nn.Module):
    """Convolve each (param, input) series along time with its own kernel."""

    def __init__(self, weight):
        """Store `weight`, a callable mapping x to kernels viewable as (-1, 1, time)."""
        super().__init__()
        self.weight = weight

    def forward(self, x):
        """
        Apply a grouped 1-D convolution to x and return a tensor shaped like x.

        x has shape (nparams, ninputs, time). It is left-padded with time - 1
        zeros so that output step t only depends on input steps <= t.
        """
        # x.shape = (nparams, ninputs, time)
        nparams, ninputs, time = x.shape
        groups = nparams * ninputs

        # Padding x directly keeps its device and dtype, so no separate zero
        # buffer on a globally chosen device is needed.
        padded = torch.nn.functional.pad(x, (time - 1, 0))
        kernels = self.weight(x).view(-1, 1, time)

        # One group per series: every series is convolved with its own kernel.
        out = torch.nn.functional.conv1d(padded.reshape(1, groups, 2 * time - 1),
                                         kernels, padding=0, groups=groups)
        return out[0].reshape(x.shape)
 
 
class Model(nn.Module):
    """Chain `linear`, a ConvTime built on `weight`, a sum over dim 1 and a transpose."""

    def __init__(self, weight, linear):
        super().__init__()
        self.weight = weight
        self.linear = linear
        # The convolution uses the same `weight` module that is stored above.
        self.conv = ConvTime(weight)

    def forward(self, x):
        """Return conv(linear(x)) summed over dim 1 and transposed."""
        scaled = self.linear(x)
        filtered = self.conv(scaled)
        return filtered.sum(dim=1).t()

## Generate Data from Target Model

In [None]:
level_noise = 2.0
ninputs = 50 #2
time = 1000 #

In [None]:
x = torch.rand(1, ninputs, time, device=device) # inputs 

model_real = Model(AR1Weight(ninputs).to(device), 
 Linear(ninputs).to(device)).to(device)

y_real_array = model_real(x).detach().cpu().numpy().copy()
y_real_array_noise = np.float32(y_real_array+np.random.normal(scale=level_noise, size=y_real_array.shape))
y_real = torch.tensor(y_real_array_noise).to(device)


plt.plot(y_real_array, label='real')
plt.plot(y_real_array_noise, label='noisy')
plt.legend()

## Model to Train

In [None]:
model_train = Model(AR1Weight(ninputs).to(device),
 Linear(ninputs).to(device)).to(device)

plt.plot(y_real_array, label='real')
plt.plot(model_train(x).detach().cpu().numpy(), label='pred_init')
plt.legend()

In [None]:
niter = 1000
optimizer = torch.optim.Adam(model_train.parameters(), lr=1e-2)
loss_iter = torch.empty(niter, device=device)

In [None]:
# Number of time steps as a tensor, used to average the squared error.
ntimes = torch.tensor(time, dtype=torch.float, device=device)

progress = tqdm(range(niter))
for t in progress:

    y_pred = model_train(x)

    # Squared error divided by the number of time steps, then summed.
    loss_array = torch.div(torch.pow(y_pred - y_real, 2), ntimes)
    loss_total = torch.sum(loss_array)
    optimizer.zero_grad()
    loss_total.backward()
    optimizer.step()

    # Store a detached value: writing the grad-tracking loss into loss_iter
    # would make the log part of the autograd graph and extend that graph on
    # every iteration.
    loss_iter[t] = loss_total.detach()

## Loss function

In [None]:
plt.plot(loss_iter.detach().cpu().numpy(), label='loss')
plt.legend()

## Estimation

In [None]:
# Compare the target model with the model after training. The second curve is
# labelled 'pred_trained' because 'pred_init' would describe the untrained model.
plt.plot(model_real(x).detach().cpu().numpy(), label='real')
plt.plot(model_train(x).detach().cpu().numpy(), label='pred_trained')
plt.legend()