经典的卷积神经网络

LeNet

模型

LeNet-5

大小\(1\times32\times32\)的灰度图像,通过\(6\times1\times5\times5\)的filter,此时输出图像维度为\(6\times28\times28\)

通过Sigmoid激活函数后,再经过\(2\times2\)的平均池化(\(\mathrm{stride=2}\)),此时输出图像维度为\(6\times14\times14\)

通过\(16\times6\times5\times5\)的filter,此时输出图像的维度为\(16\times10\times10\)

通过Sigmoid激活函数后,再经过\(2\times2\)的平均池化(\(\mathrm{stride=2}\)),此时输出图像维度为\(16\times5\times5\)

通过\(120\times16\times5\times5\)的filter,此时输出图像维度为\(120\times1\times1\),也就是每个通道处只剩下一个像素点,我们可以当成全连接层的输入。

\(120\times1\times1\)的图像平铺成一维向量,注意,对于\(N\)个样本,此时的输入图像维度是\(N\times120\times1\times1\),使用nn.Flatten()即可平铺为\(N\times120\)的矩阵

通过nn.Linear(120, 84)的全连接层,再通过nn.Linear(84, 10)的全连接层,通过Softmax后就是对10类预测的概率

torchvision.datasets.MNIST数据集是手写数字识别,一共10个类别,输入图像维度为\(1\times28\times28\),因此,我们需要将其先padding到\(1\times32\times32\)的维度,可以在第一个卷积中实现,也可以在预处理中实现(注意:下方代码实际加载的是FashionMNIST数据集,其图像尺寸与类别数和MNIST相同)

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

class LeNet(nn.Module):
    """LeNet-5 for 1x32x32 grayscale images, producing 10 class logits."""

    def __init__(self):
        super().__init__()
        # 1x32x32 -> 6x28x28
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1)
        # 6x28x28 -> 6x14x14
        self.pool1 = nn.AvgPool2d(2)
        # 6x14x14 -> 16x10x10
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        # 16x10x10 -> 16x5x5
        self.pool2 = nn.AvgPool2d(2)
        # 16x5x5 -> 120x1x1: the 5x5 kernel consumes the whole feature map.
        self.conv3 = nn.Conv2d(16, 120, kernel_size=5)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(120, 84)
        self.linear2 = nn.Linear(84, 10)

    def forward(self, x):
        # torch.sigmoid replaces the deprecated F.sigmoid.
        x = torch.sigmoid(self.conv1(x))
        x = self.pool1(x)
        x = torch.sigmoid(self.conv2(x))
        x = self.pool2(x)
        x = self.flatten(self.conv3(x))
        x = torch.sigmoid(self.linear1(x))
        # Raw logits; CrossEntropyLoss applies softmax internally.
        return self.linear2(x)

def train(model, loss_function, device, optimizer, data_iter):
    """Run one training epoch; return the sample-weighted mean loss."""
    model.train()
    total_loss, total_samples = 0.0, 0
    for features, labels in data_iter:
        features, labels = features.to(device), labels.to(device)
        batch_loss = loss_function(model(features), labels)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        batch_size = len(labels)
        total_loss += batch_loss.item() * batch_size
        total_samples += batch_size
    return total_loss / total_samples

def test(model, device, data_iter):
model.eval()
num = 0
right_num = 0
with torch.no_grad():
for X, y in data_iter:
X, y = X.to(device), y.to(device)
y_hat = model(X).argmax(dim=1)
right_num += (y_hat == y).sum().item()
num += len(y)
return right_num / num


# Pad 28x28 images to the 32x32 input LeNet expects, then convert to tensor.
transform = transforms.Compose([
    transforms.Pad(padding=2),
    transforms.ToTensor()
])

batch_size = 256
# NOTE(review): the text above mentions MNIST, but this loads FashionMNIST
# (same 1x28x28 shape and 10 classes). download=False assumes the data is
# already present under ./data — confirm before running.
train_data = datasets.FashionMNIST(root='./data', train=True, transform=transform, download=False)
test_data = datasets.FashionMNIST(root='./data', train=False, transform=transform, download=False)
train_iter = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_iter = DataLoader(dataset=test_data, shuffle=False, batch_size=batch_size, num_workers=4)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LeNet().to(device)
# CrossEntropyLoss combines log-softmax and NLL, so the model emits raw logits.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 10
for epoch in range(num_epochs):
    loss = train(model, loss_function, device, optimizer, train_iter)
    print(f'epoch {epoch + 1}, Loss {loss}')

print(f'Test Accuracy {test(model, device, test_iter)*100:.2f}%')

AlexNet

模型

相比于LeNet,AlexNet神经网络模型深度更深,有8层(5个卷积层和3个全连接层),而LeNet只有5层(2个卷积和3个全连接层)

AlexNet使用了ReLU作为激活函数,而LeNet使用的是Sigmoid激活函数

AlexNet在卷积层后使用了LRN,这种技术模仿了生物神经系统的“侧抑制”机制,但后来人们发现并没有什么作用

AlexNet在训练过程中使用了数据增强技术,如图像翻转、裁剪和颜色变换等,这些技术有效地减少了过拟合,提高了模型的泛化能力

AlexNet在全连接层使用了Dropout技术,这是一种正则化方法,可以在训练过程中随机丢弃一些神经元,以减少模型复杂度,防止过拟合

LeNet(左) AlexNet(右)

代码

由于ImageNet数据集太大,训练成本较高,这里我们选用CIFAR10数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class AlexNet(nn.Module):
    """AlexNet adapted to 10-class CIFAR10 with 3x224x224 inputs."""

    def __init__(self):
        super().__init__()
        # Feature extractor: 3x224x224 -> 256x5x5 (= 6400 flattened features).
        self.conv1 = nn.Conv2d(3, 96, kernel_size=11, stride=4)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(96, 256, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(256, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 384, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        # Classifier head with dropout regularisation on both hidden layers.
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(6400, 4096)
        self.dropout = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(4096, 4096)
        self.linear3 = nn.Linear(4096, 10)

    def forward(self, x):
        """Map a (N, 3, 224, 224) batch to (N, 10) class logits."""
        h = self.pool1(F.relu(self.conv1(x)))
        h = self.pool2(F.relu(self.conv2(h)))
        # Three consecutive same-padding convolutions, no pooling in between.
        for conv in (self.conv3, self.conv4, self.conv5):
            h = F.relu(conv(h))
        h = self.flatten(self.pool3(h))
        h = self.dropout(F.relu(self.linear1(h)))
        h = self.dropout(F.relu(self.linear2(h)))
        return self.linear3(h)

def train(data_iter, model, device, loss_function, optimizer):
    """Run one training epoch; return (mean loss per sample, samples/second).

    Bug fix: the original initialised `num` to the dataset size AND also
    incremented it by the batch size inside the loop, double-counting the
    samples — halving the reported loss and doubling the throughput figure.
    """
    start_time = time.time()
    model.train()
    L = 0
    num = len(data_iter.dataset)
    for X, y in data_iter:
        X, y = X.to(device), y.to(device)
        y_hat = model(X)
        loss = loss_function(y_hat, y)
        # Weight the batch-mean loss by the batch size for a true epoch mean.
        L += loss.item() * len(y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    samples_per_second = num / (time.time() - start_time)
    return (L / num, samples_per_second)

def test(data_iter, model, device):
model.eval()
with torch.no_grad():
num = len(data_iter.dataset)
right_num = 0
for X, y in data_iter:
X, y = X.to(device), y.to(device)
y_hat = model(X).argmax(dim=1)
right_num += (y_hat == y).sum().item()
return right_num / num

def init_weight(m):
    """Kaiming-normal weights and zero biases for conv/linear layers only."""
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        nn.init.zeros_(m.bias)

# Upscale 32x32 CIFAR10 images to the 224x224 input AlexNet expects.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor()
])
batch_size = 256
# download=False assumes CIFAR10 is already present under ./data.
train_data = datasets.CIFAR10(root='./data', transform=transform, train=True, download=False)
test_data = datasets.CIFAR10(root='./data', transform=transform, train=False, download=False)
train_iter = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_iter = DataLoader(dataset=test_data, shuffle=False, batch_size=batch_size, num_workers=4)

net = AlexNet()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Kaiming-normal weights / zero biases for every conv and linear layer.
net.apply(init_weight)
net.to(device)

optimizer = optim.Adam(net.parameters(), lr=0.0001)
loss_function = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

VGG

模型

相比于AlexNet,VGG(Visual Geometry Group)统一使用\(3\times3\)的小卷积核和\(2\times2\)的池化层,网络结构更加简单规范

VGG将多个卷积层和一个池化层打包成一个模块,可以用模块的方式构建神经网络

原始VGG网络有5个卷积块,其中前两个块各有一个卷积层,后三个块各包含两个卷积层,第一个模块有64个输出通道,每个后续模块将输出通道数量翻倍,直到该数字达到512。由于该网络使用8个卷积层和3个全连接层,因此它通常被称为VGG-11

VGG块不通过卷积层减小图像的宽高,VGG的卷积层通常采用same卷积,保证输出的图像宽高尺寸和输入的图像宽高尺寸相同。VGG块通过最大池化层(最大汇聚层)减小图像的宽高

代码

使用函数将一个VGG块打包,包装到nn.Sequential()中,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class VGG11(nn.Module):
    """VGG-11: five conv blocks (8 conv layers) plus a three-layer classifier."""

    def __init__(self):
        super().__init__()
        # (num_convs, in_channels, out_channels); each block halves H and W.
        self.block1 = self.__vgg_block(1, 3, 64)
        self.block2 = self.__vgg_block(1, 64, 128)
        self.block3 = self.__vgg_block(2, 128, 256)
        self.block4 = self.__vgg_block(2, 256, 512)
        self.block5 = self.__vgg_block(2, 512, 512)

        self.flatten = nn.Flatten()
        # 224 / 2**5 = 7, so the final feature map is 512x7x7.
        self.linear1 = nn.Linear(512 * 7 * 7, 4096)
        self.dropout = nn.Dropout(p=0.5)
        self.linear2 = nn.Linear(4096, 4096)
        self.linear3 = nn.Linear(4096, 10)

    def forward(self, x):
        """Map (N, 3, 224, 224) inputs to (N, 10) class logits."""
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            x = block(x)
        x = self.flatten(x)
        x = self.dropout(F.relu(self.linear1(x)))
        x = self.dropout(F.relu(self.linear2(x)))
        return self.linear3(x)

    def __vgg_block(self, num_convs, in_channels, out_channels):
        """`num_convs` same-padding 3x3 conv+ReLU pairs, then a 2x2 max pool."""
        layers = []
        for _ in range(num_convs):
            layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
            ]
            in_channels = out_channels
        layers.append(nn.MaxPool2d(kernel_size=2))

        return nn.Sequential(*layers)

def train(data_iter, model, device, loss_function, optimizer):
    """One training epoch; returns (mean loss per sample, samples/second)."""
    start_time = time.time()
    model.train()
    num = len(data_iter.dataset)
    running_loss = 0
    for X, y in data_iter:
        X, y = X.to(device), y.to(device)
        loss = loss_function(model(X), y)
        # Weight the batch-mean loss by batch size for a true epoch mean.
        running_loss += loss.item() * len(y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return (running_loss / num, num / (time.time() - start_time))

def test(data_iter, model, device):
model.eval()
with torch.no_grad():
num = len(data_iter.dataset)
right_num = 0
for X, y in data_iter:
X, y = X.to(device), y.to(device)
y_hat = model(X).argmax(dim=1)
right_num += (y_hat == y).sum().item()

return right_num / num


def init_weight(m):
    """He-initialise the weights and zero the biases of Conv2d/Linear modules."""
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
        nn.init.zeros_(m.bias)

# Upscale 32x32 CIFAR10 images to the 224x224 input VGG expects.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor()
])
batch_size = 128

# Reuses the dataset already downloaded for the AlexNet experiment (download=False).
train_data = datasets.CIFAR10(root='../AlexNet/data', train=True, transform=transform, download=False)
test_data = datasets.CIFAR10(root='../AlexNet/data', train=False, transform=transform, download=False)

train_iter = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_iter = DataLoader(test_data, shuffle=False, batch_size=batch_size, num_workers=4)

net = VGG11()
# Kaiming-normal weights / zero biases for every conv and linear layer.
net.apply(init_weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

num_epochs = 10
for epoch in range(num_epochs):
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

NiN

模型

NiN(Network in Network),NiN引入了\(1\times1\)的卷积核,这种卷积核可以在不改变空间维度的情况下,增加网络的深度和非线性能力,同时减少参数数量。在网络的末端,NiN使用全局平均池化层来代替传统的全连接层。这种方法不仅减少了参数数量,还使得模型更加灵活,能够适应不同尺寸的输入。

\(1\times1\)卷积核实际上就是将每个像素当成一个样本,通道当成一个像素的特征,应用全连接层。

LeNet、AlexNet、VGG都有一个共同的设计模式:通过一系列的卷积层和池化层来提取空间特征结构,然后通过全连接层对特征的表征进行处理。然而,如果使用了全连接层,可能会完全放弃表征的空间结构。NiN提供了一个非常简单的解决方法:在每个像素的通道上分别使用多层感知机

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class NiN(nn.Module):
    """NiN: four NiN blocks with global average pooling instead of dense layers."""

    def __init__(self):
        super().__init__()
        self.nin1 = self.__nin_block(3, 96, kernel_size=11, stride=4)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        # NOTE(review): the reference NiN uses padding=2 for this 5x5 block;
        # padding=1 still runs but shrinks the feature map slightly — confirm intent.
        self.nin2 = self.__nin_block(96, 256, kernel_size=5, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.nin3 = self.__nin_block(256, 384, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        # Final block emits one channel per class (10 for CIFAR10).
        self.nin4 = self.__nin_block(384, 10, kernel_size=3, padding=1)
        # Global average pooling collapses each channel to a single value.
        self.pool4 = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Map (N, 3, H, W) inputs to (N, 10) class logits."""
        for nin, pool in (
            (self.nin1, self.pool1),
            (self.nin2, self.pool2),
            (self.nin3, self.pool3),
            (self.nin4, self.pool4),
        ):
            x = pool(nin(x))
        return self.flatten(x)

    def __nin_block(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """One spatial conv followed by two 1x1 convs (a per-pixel MLP), all ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, 1),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, 1),
            nn.ReLU(),
        )

def train(data_iter, model, device, loss_function, optimizer):
    """Train for one epoch; return (mean per-sample loss, throughput)."""
    start_time = time.time()
    model.train()
    sample_count = len(data_iter.dataset)
    running_loss = 0
    for X, y in data_iter:
        X, y = X.to(device), y.to(device)
        loss = loss_function(model(X), y)
        running_loss += loss.item() * len(y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    elapsed = time.time() - start_time
    return (running_loss / sample_count, sample_count / elapsed)

def test(data_iter, model, device):
model.eval()
with torch.no_grad():
num = len(data_iter.dataset)
right_num = 0
for X, y in data_iter:
X, y = X.to(device), y.to(device)
y_hat = model(X).argmax(dim=1)
right_num += (y_hat == y).sum().item()

return right_num / num

def init_weight(m):
    """Apply He initialisation to conv/linear weights; zero their biases."""
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(m.weight)
        nn.init.zeros_(m.bias)

# Upscale 32x32 CIFAR10 images to 224x224 to match the NiN layer sizing above.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor()
])
batch_size = 128

# Reuses the dataset already downloaded for the AlexNet experiment (download=False).
train_data = datasets.CIFAR10(root='../AlexNet/data', train=True, transform=transform, download=False)
test_data = datasets.CIFAR10(root='../AlexNet/data', train=False, transform=transform, download=False)

train_iter = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_iter = DataLoader(test_data, shuffle=False, batch_size=batch_size, num_workers=4)

net = NiN()
# Kaiming-normal weights / zero biases for every conv layer (NiN has no Linear).
net.apply(init_weight)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

num_epochs = 10
for epoch in range(num_epochs):
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.1f} examples/sec')

print(f'test accuracy {test(test_iter, net, device)*100:.2f}%')

GoogLeNet

模型

在GoogLeNet中,基本的卷积块被称为Inception块(Inception block)

Inception块的架构

GoogLeNet的核心是Inception模块。每个Inception模块都包含了多个并行的卷积层和池化层,它们以不同的方式处理输入数据,并将结果拼接起来。这种设计允许网络在多个尺度上捕获特征,增强了特征提取的能力。

Inception模块中广泛使用了1x1卷积,这种卷积可以减少特征图的深度,降低计算量,同时增加非线性。

GoogLeNet的一个重点是解决了什么样大小的卷积核最合适的问题,该论文的一个观点是有时使用不同大小的卷积核组合是有利的

Inception块由四条并行路线组成。前三条路径使用窗口大小为\(1\times1\)、\(3\times3\)和\(5\times5\)的卷积层,从不同空间大小中提取信息。中间两条路径在输入上执行\(1\times1\)卷积,以减少通道数,从而降低模型的复杂性。第四条路径使用\(3\times3\)最大池化层,然后使用\(1\times1\)卷积层来改变通道数。这四条路径都使用合适的填充来使输入与输出的高和宽一致,最后我们将每条线路的输出在通道维度上连结,并构成Inception块的输出。在Inception块中,通常调整的超参数是每层输出通道数。

GoogLeNet一共使用9个Inception块和全局平均池化层的堆叠来生成其估计值,Inception块之间的最大池化层可降低维度。第一个模块类似于AlexNet和LeNet,Inception块的组合从VGG继承,全局平均池化层避免了在最后使用全连接层

代码

先构建Inception块然后构建GoogLeNet

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import time

class Inception(nn.Module):
    """GoogLeNet Inception block: four parallel paths concatenated on channels.

    c1: output channels of the 1x1 path; c2/c3: (reduce, out) channel pairs
    for the 3x3 and 5x5 paths; c4: output channels of the pooling path.
    All paths preserve H and W, so outputs concatenate on the channel dim.
    """

    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)

        # Path 1: a single 1x1 convolution.
        self.p1 = nn.Conv2d(in_channels=in_channels, out_channels=c1, kernel_size=1)

        # Path 2: 1x1 reduction followed by a 3x3 convolution.
        self.p2_1 = nn.Conv2d(in_channels=in_channels, out_channels=c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(in_channels=c2[0], out_channels=c2[1], kernel_size=3, padding=1)

        # Path 3: 1x1 reduction followed by a 5x5 convolution.
        # Bug fix: this path is the 5x5 branch (see comment/prose), but the
        # original used kernel_size=3, padding=1; padding=2 keeps H and W.
        self.p3_1 = nn.Conv2d(in_channels=in_channels, out_channels=c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(in_channels=c3[0], out_channels=c3[1], kernel_size=5, padding=2)

        # Path 4: 3x3 max pooling followed by a 1x1 convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels=in_channels, out_channels=c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))

        return torch.cat((p1, p2, p3, p4), dim=1)

class GoogLeNet(nn.Module):
    """GoogLeNet: stem (b1, b2), nine Inception blocks (b3-b5), linear head."""

    def __init__(self):
        super().__init__()
        # Stem: 7x7 conv then 3x3 max pool.
        self.b1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # 1x1 bottleneck followed by a 3x3 conv, then pooling.
        self.b2 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # Two Inception blocks; output channels 256 then 480.
        self.b3 = nn.Sequential(
            Inception(192, 64, (96, 128), (16, 32), 32),
            Inception(256, 128, (128, 192), (32, 96), 64),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # Five Inception blocks; output channels 512 -> 832.
        self.b4 = nn.Sequential(
            Inception(480, 192, (96, 208), (16, 48), 64),
            Inception(512, 160, (112, 224), (24, 64), 64),
            Inception(512, 128, (128, 256), (24, 64), 64),
            Inception(512, 112, (144, 288), (32, 64), 64),
            Inception(528, 256, (160, 320), (32, 128), 128),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        # Final two Inception blocks, then global average pooling + flatten.
        self.b5 = nn.Sequential(
            Inception(832, 256, (160, 320), (32, 128), 128),
            Inception(832, 384, (192, 384), (48, 128), 128),
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
            nn.Flatten(),
        )
        # Global pooling leaves 1024 channels regardless of input size.
        self.linear = nn.Linear(1024, 10)

    def forward(self, x):
        for stage in (self.b1, self.b2, self.b3, self.b4, self.b5):
            x = stage(x)
        return self.linear(x)

def train(data_iter, model, device, loss_function, optimizer):
    """One epoch of optimisation; returns (mean loss, examples/sec)."""
    tic = time.time()
    model.train()
    total = len(data_iter.dataset)
    loss_sum = 0
    for batch_x, batch_y in data_iter:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        batch_loss = loss_function(model(batch_x), batch_y)
        loss_sum += batch_loss.item() * len(batch_y)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return (loss_sum / total, total / (time.time() - tic))

def test(data_iter, model, device):
model.eval()
num = len(data_iter.dataset)
right_num = 0
with torch.no_grad():
for X, y in data_iter:
X, y = X.to(device), y.to(device)
y_hat = model(X).argmax(dim=1)
right_num += (y_hat == y).sum().item()
return right_num / num

def init_weight(m):
    """Kaiming-normal weight init with zero bias for Conv2d and Linear."""
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)
        nn.init.zeros_(m.bias)

# Upscale 32x32 CIFAR10 images to 224x224.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor()
])
batch_size = 256

# NOTE(review): root is './AlexNet/data' here but '../AlexNet/data' in the
# VGG/NiN scripts — confirm which relative path matches the working directory.
train_data = datasets.CIFAR10(root='./AlexNet/data', transform=transform, train=True, download=False)
test_data = datasets.CIFAR10(root='./AlexNet/data', transform=transform, train=False, download=False)

train_iter = DataLoader(dataset=train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_iter = DataLoader(dataset=test_data, shuffle=False, batch_size=batch_size, num_workers=4)

net = GoogLeNet()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Kaiming-normal weights / zero biases for every conv and linear layer.
net.apply(init_weight)
net.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0002)

num_epochs = 10

for epoch in range(num_epochs):
    loss, samples_per_second = train(train_iter, net, device, loss_function, optimizer)
    print(f'Epoch {epoch + 1}, Loss {loss:.5f}, {samples_per_second:.2f} examples/sec')


print(f'Test accuracy {test(test_iter, net, device)*100:.2f}%')

ResNet

模型

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import torch
import torch.nn as nn
import torch.nn.functional as F

# A basic residual block (two 3x3 convs), as used in ResNet-18/34.
class BasicBlock(nn.Module):
    # Channel expansion factor; ResNet._make_layer reads block.expansion,
    # which the original class was missing.
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        # Bug fix: was nn.BatchNorm, which does not exist; must be nn.BatchNorm2d.
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Project the shortcut when stride/channel count changed.
        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

# The bottleneck residual block (1x1 -> 3x3 -> 1x1), as used in ResNet-50+.
class Bottleneck(nn.Module):
    # The last 1x1 conv widens the channels by this factor.
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        # Shortcut branch: identity, or a projection when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)

# 定义ResNet模型
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
super().__init__()
self.in_channels = 64
# 初始的卷积层,将3通道的图像转换为64通道
self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(self.in_channels)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)


def _make_layer(self, block, out_channels, blocks, stride=1):
downsample = None
# 如果步长不为1或者输入通道数不等于输出通道数,需要下采样
if stride != 1 or self.in_channels != out_channels * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.in_channels, out_channels * block.expansion)
)

经典的卷积神经网络
https://blog.shinebook.net/2025/03/09/人工智能/pytorch/经典的卷积神经网络/
作者
X
发布于
2025年3月9日
许可协议