经典的卷积神经网络

LeNet

模型

大小\(1\times32\times32\)的灰度图像，通过\(6\times1\times5\times5\)的filter，此时输出图像维度为\(6\times28\times28\)，

通过Sigmoid激活函数后，再经过\(2\times2\)的平均池化（\(\mathrm{stride=2}\)），此时输出图像维度为\(6\times14\times14\)；

通过\(16\times6\times5\times5\)的filter，此时输出图像的维度为\(16\times10\times10\)，

通过Sigmoid激活函数后，再经过\(2\times2\)的平均池化（\(\mathrm{stride=2}\)），此时输出图像维度为\(16\times5\times5\)

通过\(120\times16\times5\times5\)的filter，此时输出图像维度为\(120\times1\times1\)，也就是每个通道处只剩下一个像素点，我们可以当成全连接层的输入，

将\(120\times1\times1\)的图像平铺成一维向量，注意，对于\(N\)个样本，此时的输入图像维度是\(N\times120\times1\times1\)，使用nn.Flatten()即可平铺为\(N\times120\)的矩阵

通过nn.Linear(120, 84)的全连接层（后接Sigmoid激活函数），再通过nn.Linear(84, 10)的全连接层，通过Softmax后就是对10类预测的概率（下面的代码中模型直接输出未归一化的logits，Softmax包含在nn.CrossEntropyLoss中）

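torchvision.datasets.MNIST数据集是手写数字识别，一共10个类别，输入图像维度为\(1\times28\times28\)（下面的代码实际使用的是torchvision.datasets.FashionMNIST数据集，同样是10个类别、\(1\times28\times28\)的灰度图像），因此，我们需要将其先padding到\(1\times32\times32\)的维度，可以在第一个卷积中实现（设置padding=2），也可以在预处理中实现（下面的代码使用transforms.Pad(padding=2)）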

代码

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

class LeNet(nn.Module):
    """LeNet-style CNN: three 5x5 convolutions, two 2x2 average pools, two linear layers.

    The layer sizes are chosen for single-channel 32x32 inputs, for which the
    third convolution produces a 120x1x1 map that flattens to 120 features.
    ``forward`` returns raw 10-way scores; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the order in which forward() applies them.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1)
        self.pool1 = nn.AvgPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.pool2 = nn.AvgPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(in_features=120, out_features=84)
        self.linear2 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of class scores."""
        # Two conv -> sigmoid -> average-pool stages.
        feat = self.pool1(torch.sigmoid(self.conv1(x)))
        feat = self.pool2(torch.sigmoid(self.conv2(feat)))
        # The third convolution has no activation of its own before flattening.
        flat = self.flatten(self.conv3(feat))
        hidden = torch.sigmoid(self.linear1(flat))
        return self.linear2(hidden)
    
def train(model, loss_function, device, optimizer, data_iter):
    """Run one optimisation pass over ``data_iter`` and return the mean loss per sample.

    Args:
        model: network to optimise; switched to training mode.
        loss_function: callable ``(predictions, targets) -> scalar tensor``.
            The per-batch value is weighted by the batch size, so this
            presumably expects a batch-mean loss - confirm against the caller.
        device: device each batch is moved to before the forward pass.
        optimizer: optimiser stepped once per batch.
        data_iter: iterable of ``(inputs, targets)`` batches.

    Returns:
        Sum of ``loss * batch_size`` over all batches divided by the number of samples seen.
    """
    model.train()
    weighted_loss, seen = 0, 0
    for inputs, targets in data_iter:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_loss = loss_function(model(inputs), targets)

        # Standard update: clear stale gradients, back-propagate, step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        n_batch = len(targets)
        weighted_loss += batch_loss.item() * n_batch
        seen += n_batch
    return weighted_loss / seen
        
def test(model, device, data_iter):
    model.eval()
    num = 0
    right_num = 0
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = model(X).argmax(dim=1)
            right_num += (y_hat == y).sum().item()
            num += len(y)
    return right_num / num
        
    
# Pre-processing: pad every side by 2 pixels (so 28x28 images become 32x32),
# then convert the image to a tensor.
transform = transforms.Compose([transforms.Pad(padding=2), transforms.ToTensor()])

batch_size = 256
# download=False: the FashionMNIST files must already exist under ./data.
train_data = datasets.FashionMNIST(root='./data', train=True, download=False, transform=transform)
test_data = datasets.FashionMNIST(root='./data', train=False, download=False, transform=transform)
# Only the training split is shuffled.
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LeNet().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 10
for epoch in range(num_epochs):
    # One full pass over the training data per epoch.
    loss = train(model, loss_function, device, optimizer, train_iter)
    print(f'epoch  {epoch + 1}, Loss {loss}')

# Accuracy on the held-out split, evaluated once after training.
print(f'Test Accuracy {test(model, device, test_iter)*100:.2f}%')

AlexNet

模型

相比于LeNet，AlexNet神经网络模型深度更深，有8层（5个卷积层和3个全连接层），而LeNet只有5层（2个卷积层和3个全连接层；上文中输出为\(120\times1\times1\)的第三个卷积作用在\(5\times5\)的输入上，等价于一个全连接层）

AlexNet使用了ReLU作为激活函数，而LeNet使用的是Sigmoid激活函数

AlexNet在卷积层后使用了LRN，这种技术模仿了生物神经系统的“侧抑制”机制，但后来人们发现并没有什么作用

AlexNet在训练过程中使用了数据增强技术，如图像翻转、裁剪和颜色变换等，这些技术有效地减少了过拟合，提高了模型的泛化能力

AlexNet在全连接层使用了Dropout技术，这是一种正则化方法，可以在训练过程中随机丢弃一些神经元，以减少模型复杂度，防止过拟合

代码

由于ImageNet数据集太大，训练成本较高，这里我们选用CIFAR10数据集

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class AlexNet(nn.Module):
    """AlexNet-style classifier: five convolutions, three max-pools, three linear layers.

    ``linear1`` expects 6400 (= 256 * 5 * 5) flattened features, which is what
    the convolutional stack produces for 3x224x224 inputs. The network emits
    10 raw class scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        # Classifier head; a single Dropout module is reused after both hidden layers.
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(in_features=6400, out_features=4096)
        self.dropout = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(in_features=4096, out_features=4096)
        self.linear3 = nn.Linear(in_features=4096, out_features=10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for a batch of images."""
        out = self.pool1(F.relu(self.conv1(x)))
        out = self.pool2(F.relu(self.conv2(out)))
        # Three back-to-back 3x3 convolutions, pooled once at the end.
        for conv in (self.conv3, self.conv4, self.conv5):
            out = F.relu(conv(out))
        out = self.flatten(self.pool3(out))
        # Each hidden linear layer is followed by ReLU and dropout.
        for hidden in (self.linear1, self.linear2):
            out = self.dropout(F.relu(hidden(out)))
        return self.linear3(out)
    
def train(data_iter, model, device, loss_function, optimizer):
    """Train ``model`` for one pass over ``data_iter``.

    Args:
        data_iter: iterable of ``(X, y)`` batches.
        model: network to optimise; switched to training mode.
        device: device each batch is moved to.
        loss_function: callable ``(y_hat, y) -> scalar tensor``. The per-batch
            value is weighted by the batch size, so this presumably expects a
            batch-mean loss - confirm against the caller.
        optimizer: optimiser stepped once per batch.

    Returns:
        ``(mean_loss, samples_per_second)`` where ``mean_loss`` is the
        sample-weighted average of the batch losses and ``samples_per_second``
        is the number of processed samples divided by the wall-clock time.
    """
    start_time = time.time()
    model.train()
    L = 0
    # Count samples as they are processed. The counter previously started at
    # len(data_iter.dataset) *and* was incremented per batch, so every sample
    # was counted twice: the reported loss was halved and the throughput doubled.
    num = 0
    for X, y in data_iter:
        X, y = X.to(device), y.to(device)
        y_hat = model(X)
        loss = loss_function(y_hat, y)
        num += len(y)
        L += loss.item() * len(y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Guard against a zero reading on clocks with coarse resolution.
    elapsed = max(time.time() - start_time, 1e-9)
    samples_per_second = num / elapsed
    return (L / num, samples_per_second)

def test(data_iter, model, device):
    model.eval()
    with torch.no_grad():
        num = len(data_iter.dataset)
        right_num = 0
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = model(X).argmax(dim=1)
            right_num += (y_hat == y).sum().item()
    return right_num / num

def init_weight(m):
    """Kaiming-normal initialise ``m.weight`` and zero ``m.bias`` for Linear/Conv2d modules.

    Written to be passed to ``nn.Module.apply``; modules of any other type are
    left untouched.
    """
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        # Layers created with bias=False have ``bias is None``; zeroing None would raise.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Resize every image to 224x224 (the size the AlexNet layer dimensions are built for),
# then convert it to a tensor.
transform = transforms.Compose([transforms.Resize(size=(224, 224)), transforms.ToTensor()])
batch_size = 256
# download=False: the CIFAR10 files must already exist under ./data.
train_data = datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)
test_data = datasets.CIFAR10(root='./data', train=False, download=False, transform=transform)
# Only the training split is shuffled.
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

net = AlexNet()
# Prefer the first CUDA device when available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Re-initialise all Linear/Conv2d layers before moving the model to the device.
net.apply(init_weight)
net.to(device)

optimizer = optim.Adam(net.parameters(), lr=0.0001)
loss_function = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    # train() returns a (loss, samples-per-second) pair for the epoch.
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

# Accuracy on the held-out split, evaluated once after training.
print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

VGG

模型

相比于AlexNet，VGG(Visual Geometry Group)统一使用\(3\times3\)的小卷积核和\(2\times2\)的池化层，网络结构更加简单规范

VGG将多个卷积层和一个池化层打包成一个模块，可以用模块的方式构建神经网络

原始VGG网络有5个卷积块，其中前两个块各有一个卷积层，后三个块各包含两个卷积层，第一个模块有64个输出通道，每个后续模块将输出通道数量翻倍，直到该数字达到512。由于该网络使用8个卷积层和3个全连接层，因此它通常被称为VGG-11

VGG块不通过卷积层减小图像的宽高，VGG的卷积层通常采用same卷积，保证输出的图像宽高尺寸和输入的图像宽高尺寸相同。VGG块通过最大池化层（最大汇聚层）减小图像的宽高

代码

使用函数将一个VGG块打包，包装到nn.Sequential()中，再用这些VGG块搭建整个网络。

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class VGG11(nn.Module):
    """VGG-11-style classifier: five conv blocks (1, 1, 2, 2, 2 convolutions) and three linear layers.

    Every block ends with a 2x2 max-pool, so five blocks shrink a 224x224
    input to 7x7; ``linear1`` is sized for that (512 * 7 * 7 features).
    The network emits 10 raw class scores per sample.
    """

    def __init__(self):
        super().__init__()
        make_block = self.__vgg_block
        # (number of convolutions, input channels, output channels) per block.
        self.block1 = make_block(1, 3, 64)
        self.block2 = make_block(1, 64, 128)
        self.block3 = make_block(2, 128, 256)
        self.block4 = make_block(2, 256, 512)
        self.block5 = make_block(2, 512, 512)

        # Classifier head; one Dropout module is shared by both hidden layers.
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(512 * 7 * 7, 4096)
        self.dropout = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(4096, 4096)
        self.linear3 = nn.Linear(4096, 10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for a batch of images."""
        features = x
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            features = block(features)
        hidden = self.flatten(features)

        for layer in (self.linear1, self.linear2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.linear3(hidden)

    def __vgg_block(self, num_convs, in_channels, out_channels):
        """Build ``num_convs`` x (3x3 same-padding conv + ReLU) followed by a 2x2 max-pool."""
        stage = []
        channels = in_channels
        for _ in range(num_convs):
            stage += [nn.Conv2d(channels, out_channels, kernel_size=3, padding=1), nn.ReLU()]
            # Every convolution after the first maps out_channels -> out_channels.
            channels = out_channels
        stage.append(nn.MaxPool2d(kernel_size=2))
        return nn.Sequential(*stage)
    
def train(data_iter, model, device, loss_function, optimizer):
    """Train ``model`` for one pass over ``data_iter``.

    Returns ``(mean_loss, samples_per_second)``. Both use
    ``len(data_iter.dataset)`` as the sample count, so ``data_iter`` must
    expose a ``dataset`` attribute (as ``DataLoader`` does). The per-batch loss
    is weighted by the batch size, which presumably assumes a batch-mean
    loss - confirm against the caller.
    """
    start_time = time.time()
    model.train()
    dataset_size = len(data_iter.dataset)
    weighted_loss = 0
    for inputs, targets in data_iter:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_loss = loss_function(model(inputs), targets)
        weighted_loss += batch_loss.item() * len(targets)
        # Standard update: clear stale gradients, back-propagate, step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    elapsed = time.time() - start_time
    return (weighted_loss / dataset_size, dataset_size / elapsed)

def test(data_iter, model, device):
    model.eval()
    with torch.no_grad():
        num = len(data_iter.dataset)
        right_num = 0
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = model(X).argmax(dim=1)
            right_num += (y_hat == y).sum().item()

    return right_num / num


def init_weight(m):
    """Initialise a Linear/Conv2d module: Kaiming-normal weights, zero bias.

    Meant for ``net.apply(init_weight)``; any other module type is ignored.
    """
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        # ``bias`` is None for layers built with bias=False; there is nothing to zero then.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Resize every image to 224x224 (the size the VGG11 layer dimensions are built for),
# then convert it to a tensor.
transform = transforms.Compose([transforms.Resize(size=(224, 224)), transforms.ToTensor()])
batch_size = 128

# download=False: the CIFAR10 files must already exist under ../AlexNet/data.
train_data = datasets.CIFAR10(root='../AlexNet/data', train=True, download=False, transform=transform)
test_data = datasets.CIFAR10(root='../AlexNet/data', train=False, download=False, transform=transform)

# Only the training split is shuffled.
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

net = VGG11()
# Re-initialise all Linear/Conv2d layers, then move the model to the chosen device.
net.apply(init_weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

num_epochs = 10
for epoch in range(num_epochs):
    # train() returns a (loss, samples-per-second) pair for the epoch.
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

# Accuracy on the held-out split, evaluated once after training.
print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

NiN

模型

NiN(Network in Network)，NiN引入了\(1\times1\)的卷积核，这种卷积核可以在不改变空间维度的情况下，增加网络的深度和非线性能力，同时减少参数数量。在网络的末端，NiN使用全局平均池化层来代替传统的全连接层。这种方法不仅减少了参数数量，还使得模型更加灵活，能够适应不同尺寸的输入。

\(1\times1\)卷积核实际上就是将每个像素当成一个样本，通道当成一个像素的特征，应用全连接层。

LeNet、AlexNet、VGG都有一个共同的设计模式：通过一系列的卷积层和池化层来提取空间特征结构，然后通过全连接层对特征的表征进行处理。然而，如果使用了全连接层，可能会完全放弃表征的空间结构。NiN提供了一个非常简单的解决方法：在每个像素的通道上分别使用多层感知机

代码

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class NiN(nn.Module):
    """Network-in-Network classifier: four NiN blocks and no linear layers.

    The last block has 10 output channels; global average pooling reduces each
    channel to one value, so the flattened result is a ``(batch, 10)`` score tensor.
    """

    def __init__(self):
        super().__init__()
        build = self.__nin_block
        self.nin1 = build(3, 96, kernel_size=11, stride=4)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        # NOTE(review): the 5x5 stage uses padding=1, so it shrinks the map by 2 pixels;
        # a "same" 5x5 convolution would need padding=2 - confirm this is intended.
        self.nin2 = build(96, 256, kernel_size=5, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.nin3 = build(256, 384, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        # One output channel per class.
        self.nin4 = build(384, 10, kernel_size=3, padding=1)
        self.pool4 = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for a batch of images."""
        stages = (
            (self.nin1, self.pool1),
            (self.nin2, self.pool2),
            (self.nin3, self.pool3),
            (self.nin4, self.pool4),
        )
        out = x
        for block, pool in stages:
            out = pool(block(out))
        return self.flatten(out)

    def __nin_block(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """Build one spatial convolution followed by two 1x1 convolutions, each with ReLU."""
        spatial = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        pointwise_a = nn.Conv2d(out_channels, out_channels, 1)
        pointwise_b = nn.Conv2d(out_channels, out_channels, 1)
        return nn.Sequential(spatial, nn.ReLU(), pointwise_a, nn.ReLU(), pointwise_b, nn.ReLU())
        
def train(data_iter, model, device, loss_function, optimizer):
    """Optimise ``model`` for one epoch and report loss and throughput.

    Returns:
        ``(mean_loss, samples_per_second)``; both are computed with
        ``len(data_iter.dataset)`` as the number of samples. The per-batch
        loss is weighted by the batch size, which presumably assumes a
        batch-mean loss - confirm against the caller.
    """
    start_time = time.time()
    model.train()
    n_total = len(data_iter.dataset)
    loss_sum = 0
    for features, labels in data_iter:
        features, labels = features.to(device), labels.to(device)
        step_loss = loss_function(model(features), labels)
        loss_sum += step_loss.item() * len(labels)
        # Clear stale gradients, back-propagate, update parameters.
        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()

    throughput = n_total / (time.time() - start_time)
    return (loss_sum / n_total, throughput)

def test(data_iter, model, device):
    model.eval()
    with torch.no_grad():
        num = len(data_iter.dataset)
        right_num = 0
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = model(X).argmax(dim=1)
            right_num += (y_hat == y).sum().item()

    return right_num / num

def init_weight(m):
    """Apply Kaiming-normal weights and a zero bias to Linear/Conv2d modules.

    Designed as the callback for ``nn.Module.apply``; other module types pass
    through unchanged.
    """
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        # A layer constructed with bias=False has no bias tensor (``None``) to reset.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Resize every image to 224x224, then convert it to a tensor.
transform = transforms.Compose([transforms.Resize(size=(224, 224)), transforms.ToTensor()])
batch_size = 128

# download=False: the CIFAR10 files must already exist under ../AlexNet/data.
train_data = datasets.CIFAR10(root='../AlexNet/data', train=True, download=False, transform=transform)
test_data = datasets.CIFAR10(root='../AlexNet/data', train=False, download=False, transform=transform)

# Only the training split is shuffled.
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

net = NiN()
# Re-initialise all Conv2d layers, then move the model to the chosen device.
net.apply(init_weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

num_epochs = 10
for epoch in range(num_epochs):
    # train() returns a (loss, samples-per-second) pair for the epoch.
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

# Accuracy on the held-out split, evaluated once after training.
print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

GoogLeNet

模型

在GoogLeNet中，基本的卷积块被称为Inception块(Inception block)

GoogLeNet的核心是Inception模块。每个Inception模块都包含了多个并行的卷积层和池化层，它们以不同的方式处理输入数据，并将结果拼接起来。这种设计允许网络在多个尺度上捕获特征，增强了特征提取的能力。

Inception模块中广泛使用了1x1卷积，这种卷积可以减少特征图的深度，降低计算量，同时增加非线性。

GoogLeNet的一个重点是解决了什么样大小的卷积核最合适的问题，该论文的一个观点是有时使用不同大小的卷积核组合是有利的

Inception块由四条并行路径组成。前三条路径分别使用窗口大小为\(1\times1\)、\(3\times3\)和\(5\times5\)的卷积层，从不同空间大小中提取信息。中间两条路径先在输入上执行\(1\times1\)卷积，以减少通道数，从而降低模型的复杂性。第四条路径使用\(3\times3\)最大池化层，然后使用\(1\times1\)卷积层来改变通道数。这四条路径都使用合适的填充来使输入与输出的高和宽一致，最后我们将每条路径的输出在通道维度上连结，并构成Inception块的输出。在Inception块中，通常调整的超参数是每层输出通道数。

GoogLeNet一共使用9个Inception块和全局平均池化层的堆叠来生成其估计值，Inception块之间的最大池化层可降低维度。第一个模块类似于AlexNet和LeNet，Inception块的组合从VGG继承，全局平均池化层避免了在最后使用全连接层

代码

先构建Inception块然后构建GoogLeNet

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class Inception(nn.Module):
    """Inception block: four parallel branches concatenated along the channel dimension.

    Args:
        in_channels: number of input channels.
        c1: output channels of branch 1 (a single 1x1 convolution).
        c2: ``(reduce, out)`` channels of branch 2 (1x1 convolution, then 3x3).
        c3: ``(reduce, out)`` channels of branch 3 (1x1 convolution, then 5x5).
        c4: output channels of branch 4 (3x3 max-pool, then 1x1 convolution).
        **kwargs: forwarded to ``nn.Module.__init__``.

    Every branch preserves height and width, so the output has
    ``c1 + c2[1] + c3[1] + c4`` channels at the input resolution.
    """

    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)

        # Branch 1: a single 1x1 convolution.
        self.p1 = nn.Conv2d(in_channels=in_channels, out_channels=c1, kernel_size=1)

        # Branch 2: 1x1 convolution followed by a 3x3 convolution.
        self.p2_1 = nn.Conv2d(in_channels=in_channels, out_channels=c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(in_channels=c2[0], out_channels=c2[1], kernel_size=3, padding=1)

        # Branch 3: 1x1 convolution followed by a 5x5 convolution.
        # This layer was previously built as a second 3x3 convolution, contradicting
        # the branch description; padding=2 keeps the 5x5 output the same size as its input.
        self.p3_1 = nn.Conv2d(in_channels=in_channels, out_channels=c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(in_channels=c3[0], out_channels=c3[1], kernel_size=5, padding=2)

        # Branch 4: 3x3 max-pool (stride 1, size-preserving) followed by a 1x1 convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels=in_channels, out_channels=c4, kernel_size=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate their outputs on dim 1."""
        p1 = F.relu(self.p1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))

        return torch.cat((p1, p2, p3, p4), dim=1)

class GoogLeNet(nn.Module):
    """GoogLeNet-style classifier: a convolutional stem, nine Inception blocks, one linear layer.

    Stages ``b1``-``b4`` each end in a stride-2 max-pool; ``b5`` ends in global
    average pooling plus flatten, giving 1024 features that ``linear`` maps to
    10 class scores.
    """

    def __init__(self):
        super().__init__()

        def halve():
            # 3x3 max-pool with stride 2 placed at the end of stages b1-b4.
            return nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stem: 7x7 stride-2 convolution.
        self.b1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            halve(),
        )
        # 1x1 convolution followed by a 3x3 convolution that widens to 192 channels.
        self.b2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, padding=1),
            nn.ReLU(),
            halve(),
        )

        # Inception arguments per stage: (in_channels, c1, c2, c3, c4).
        # Each block's in_channels equals the previous block's c1 + c2[1] + c3[1] + c4.
        stage3 = (
            (192, 64, (96, 128), (16, 32), 32),
            (256, 128, (128, 192), (32, 96), 64),
        )
        stage4 = (
            (480, 192, (96, 208), (16, 48), 64),
            (512, 160, (112, 224), (24, 64), 64),
            (512, 128, (128, 256), (24, 64), 64),
            (512, 112, (144, 288), (32, 64), 64),
            (528, 256, (160, 320), (32, 128), 128),
        )
        stage5 = (
            (832, 256, (160, 320), (32, 128), 128),
            (832, 384, (192, 384), (48, 128), 128),
        )
        self.b3 = nn.Sequential(*[Inception(*cfg) for cfg in stage3], halve())
        self.b4 = nn.Sequential(*[Inception(*cfg) for cfg in stage4], halve())
        self.b5 = nn.Sequential(
            *[Inception(*cfg) for cfg in stage5],
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
            nn.Flatten(),
        )
        # 384 + 384 + 128 + 128 = 1024 channels leave the final Inception block.
        self.linear = nn.Linear(1024, 10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for a batch of images."""
        out = x
        for stage in (self.b1, self.b2, self.b3, self.b4, self.b5):
            out = stage(out)
        return self.linear(out)
    
def train(data_iter, model, device, loss_function, optimizer):
    """Run one training epoch and return ``(mean_loss, samples_per_second)``.

    ``len(data_iter.dataset)`` is used as the sample count for both values, so
    ``data_iter`` must expose a ``dataset`` attribute. The per-batch loss is
    weighted by the batch size, which presumably assumes a batch-mean
    loss - confirm against the caller.
    """
    start_time = time.time()
    model.train()
    sample_count = len(data_iter.dataset)
    accumulated = 0
    for batch_x, batch_y in data_iter:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        current = loss_function(model(batch_x), batch_y)
        accumulated += current.item() * len(batch_y)
        # Clear stale gradients, back-propagate, update parameters.
        optimizer.zero_grad()
        current.backward()
        optimizer.step()

    mean_loss = accumulated / sample_count
    return (mean_loss, sample_count / (time.time() - start_time))

def test(data_iter, model, device):
    model.eval()
    num = len(data_iter.dataset)
    right_num = 0
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            y_hat = model(X).argmax(dim=1)
            right_num += (y_hat == y).sum().item()
    return right_num / num

def init_weight(m):
    """Reset a Conv2d/Linear module to Kaiming-normal weights and a zero bias.

    Intended to be used through ``nn.Module.apply``; any other module type is skipped.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
        # Bias-free layers (bias=False) store ``None``; skip them instead of raising.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
        
# Resize every image to 224x224, then convert it to a tensor.
transform = transforms.Compose([transforms.Resize(size=(224, 224)), transforms.ToTensor()])
batch_size = 256

# download=False: the CIFAR10 files must already exist under ./AlexNet/data.
train_data = datasets.CIFAR10(root='./AlexNet/data', train=True, download=False, transform=transform)
test_data = datasets.CIFAR10(root='./AlexNet/data', train=False, download=False, transform=transform)

# Only the training split is shuffled.
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=4)

net = GoogLeNet()
# Prefer the first CUDA device when available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Re-initialise all Conv2d/Linear layers before moving the model to the device.
net.apply(init_weight)
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0002)

num_epochs = 10

for epoch in range(num_epochs):
    # train() returns a (loss, samples-per-second) pair for the epoch.
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.2f} examples/sec')


# Accuracy on the held-out split, evaluated once after training.
print(f'Test accuracy {test(test_iter, net, device)*100:.2f}%')

ResNet

模型

代码

import torch
import torch.nn as nn
import torch.nn.functional as F

# Basic residual block: two 3x3 convolutions plus a skip connection.
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input so that the skip
            branch matches the main branch's shape; ``None`` adds the input
            unchanged.
    """

    # Ratio of the block's output channels to ``out_channels``. The ResNet
    # builder reads ``block.expansion`` to size its stages; Bottleneck uses 4.
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        # torch.nn has no ``BatchNorm`` class; 4-D feature maps need BatchNorm2d.
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + skip(x))``."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Project the input when the main branch changed resolution or channel count.
        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
    
# Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions plus a skip connection.
class Bottleneck(nn.Module):
    """Residual block whose output has ``out_channels * expansion`` channels.

    The first 1x1 convolution maps ``in_channels`` to ``out_channels``, the 3x3
    convolution (which carries ``stride``) keeps that width, and the last 1x1
    convolution widens it by ``expansion``. ``downsample``, when given, is
    applied to the input before it is added to the main branch.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        widened = out_channels * self.expansion
        # 1x1 convolution into the bottleneck width.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # 3x3 convolution; the only layer that applies ``stride``.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1 convolution out to the expanded width.
        self.conv3 = nn.Conv2d(out_channels, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + skip(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        # No activation after the last batch-norm until the skip has been added.
        out = self.bn3(self.conv3(out))

        identity = x if self.downsample is None else self.downsample(x)

        out += identity
        return self.relu(out)

# ResNet model assembled from residual blocks.
class ResNet(nn.Module):
    """ResNet: convolutional stem, four residual stages, global average pool, linear classifier.

    Args:
        block: residual block class (or factory) called as
            ``block(in_channels, out_channels, stride, downsample)``. Its
            ``expansion`` attribute (1 when absent) gives the ratio of the
            block's output channels to ``out_channels``.
        layers: sequence of four ints - the number of blocks in each stage.
        num_classes: size of the output layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        self.in_channels = 64
        # Stem: turn the 3-channel image into 64 channels, then down-sample.
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classifier head; _make_layer leaves self.in_channels at the final stage width.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.in_channels``.

        Only the first block receives ``stride`` and the optional projection
        shortcut; the remaining blocks keep the shape unchanged.
        """
        expansion = getattr(block, 'expansion', 1)
        stage_width = out_channels * expansion

        downsample = None
        # A projection shortcut is needed when the stride is not 1 or the channel count changes.
        if stride != 1 or self.in_channels != stage_width:
            downsample = nn.Sequential(
                # 1x1 convolution matching both the channel count and the stride of the main branch.
                nn.Conv2d(self.in_channels, stage_width, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(stage_width),
            )

        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = stage_width
        stage.extend(block(self.in_channels, out_channels) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of class scores."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)

深度学习 > PyTorch

#人工智能 #神经网络 #深度学习 #PyTorch

经典的卷积神经网络

https://blog.shinebook.net/2025/03/09/人工智能/pytorch/经典的卷积神经网络/

作者

发布于

2025年3月9日

许可协议

迁移学习上一篇

torchvision 下一篇