Hand-implementing a simple, bare-bones deep learning framework in Python from scratch: from the Tensor implementation to building an MLP, with a hand-written loss function and stochastic gradient descent, supporting automatic differentiation and backpropagation for basic neural networks, and running end to end on MNIST.

Apart from random (used to generate random numbers for initializing the network parameters), the core framework needs no other imports.
The Tensor here is a minimal version, roughly a single element of a paddle.Tensor, with only the most basic and most important functionality: addition, multiplication, ReLU, and backpropagation.
Definition used here: if a node is computed from other nodes, those other nodes are its parents; for example, if c = a + b, then a and b are the parents of c.
Each node records its parent nodes so that backpropagation can compute the gradient of every node in the graph.
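A tiny illustration of this bookkeeping, usable once the Tensor class below has been defined:

a = Tensor(1.0)
b = Tensor(2.0)
c = a + b
c._parents == {a, b}     # True: c records its two operands as parents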
class Tensor:
    def __init__(self, data, _parents=()):
        self.data = data                  # the node's value
        self.grad = 0                     # the node's gradient
        self._backward = lambda: None     # backward function, initialized as a no-op
        self._parents = set(_parents)     # the node's parent nodes

    def __repr__(self):                   # nicer printing
        return f'Tensor(data={self.data})'

    def __add__(self, other):             # addition
        out = Tensor(self.data + other.data, (self, other))
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward         # backward function of the output node
        return out

    def __mul__(self, other):             # multiplication
        out = Tensor(self.data * other.data, (self, other))
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def relu(self):                       # ReLU activation
        out = Tensor(0 if self.data < 0 else self.data, (self,))
        def _backward():
            self.grad += (self.data > 0) * out.grad
        out._backward = _backward
        return out

    def backward(self):                   # backpropagate through the whole graph, computing every gradient
        # depth-first search to put the nodes into topological order
        topo = []
        vis = set()
        def build_topo(v):
            if v not in vis:
                v.grad = 0.0              # reset every gradient in the graph before accumulating
                vis.add(v)
                for parent in v._parents:
                    build_topo(parent)
                topo.append(v)            # append after all parents, so topo runs from inputs to output
        build_topo(self)
        # backward pass: visit nodes from the output back towards the inputs
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

An example: let z = 2x + y and ask for the gradient of z with respect to x, i.e. the partial derivative ∂z/∂x. It is easy to see that x.grad = 2.
Now add another relation, x = 3a + b, and ask for the gradient of z with respect to a. To differentiate this composite function we need the chain rule:

∂z/∂a = ∂z/∂x · ∂x/∂a

So a.grad = 3 * x.grad = 6, which also explains why a node's gradient is multiplied by out.grad.
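A quick check of this example with the Tensor class above (a minimal sketch: the constants 2 and 3 are wrapped in Tensor because the operators only accept Tensor operands, and the values chosen for a, b, and y are arbitrary):

a = Tensor(1.0)
b = Tensor(2.0)
x = Tensor(3.0) * a + b      # x = 3a + b
y = Tensor(4.0)
z = Tensor(2.0) * x + y      # z = 2x + y
z.backward()
x.grad, a.grad               # (2.0, 6.0)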
Next, why gradients are accumulated with +=.

Another example: z = x + x, so x.grad should be 2. If we wrote self.grad = 1.0 * out.grad instead, we would get the wrong result x.grad = 1.

ReLU is a piecewise function, so its gradient is simply handled case by case.
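Concretely, the derivative used in relu() above is d ReLU(x)/dx = 1 for x > 0 and 0 otherwise (the gradient at x = 0 is taken to be 0).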
# test gradient computation by hand
a = Tensor(2)
b = Tensor(3)
x = a*b
z = x + x
z.grad = 1        # a parent's gradient depends on its child's gradient, so z.grad must be initialized to 1
z._backward()     # compute the gradient of z's parent x
x._backward()     # compute the gradients of x's parents a and b
a.grad, b.grad, x.grad, z.grad
(6.0, 4.0, 2.0, 1)
# test backward()
a = Tensor(2)
b = Tensor(3)
x = a*b
z = x + x
z.grad = 1        # initialize z.grad to 1 (backward() also does this itself)
z.backward()
a.grad, b.grad, x.grad, z.grad
(6.0, 4.0, 2.0, 1.0)
A Linear layer amounts to one matrix multiplication followed by one addition: a one-dimensional input is multiplied by the weight matrix w, and then the bias vector b is added.
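In index form, the i-th output is out[i] = b[i] + Σ_j x[j] · w[j][i], which is exactly what the nested loops in forward() below compute.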
import random

class Linear:
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features
        # list comprehensions so that every weight and bias is a distinct Tensor
        # ([Tensor(...)] * n would repeat the same Tensor object n times)
        self.w = [[Tensor(random.random()) for _ in range(out_features)] for _ in range(in_features)]
        self.b = [Tensor(random.random()) for _ in range(out_features)]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # matrix multiplication by hand
        out = [Tensor(0.0) for _ in range(self.out_features)]
        for i in range(self.out_features):
            out[i] = out[i] + self.b[i]
            for j in range(self.in_features):
                out[i] = out[i] + x[j] * self.w[j][i]
        return out

    def parameters(self):   # all parameters of the Linear layer, used for parameter updates
        return [self.w, self.b]

# test Linear
net = Linear(2, 1)
x = [Tensor(1.0) for _ in range(2)]
net(x)
[Tensor(data=1.7437665109884302)]
The loss used here is the squared loss.

It compares the true value with the predicted value, for example a house's actual sale price versus its estimated price.

Let y be the true value and ŷ the estimate; we compare them with

l(y, ŷ) = ½ (y − ŷ)²

This is called the squared loss (also known as the mean squared loss).
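Since a layer here returns a list of outputs, the implementation below sums the per-element losses: l(ŷ, y) = Σᵢ ½ (ŷᵢ − yᵢ)².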
def squared_loss(y_hat, y):
    """squared loss"""
    loss = Tensor(0.0)
    for i in range(len(y)):
        tmp = y_hat[i] + y[i] * Tensor(-1.0)   # y_hat[i] - y[i]; subtraction is not implemented, so add the negation
        tmp = tmp * tmp * Tensor(0.5)
        loss = loss + tmp
    return loss

The optimizer used here is stochastic gradient descent.
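Each parameter θ is updated as θ ← θ − lr · ∂l/∂θ; the sgd() helper below simply walks the (possibly nested) parameter lists, applies this update to every Tensor it finds, and resets that Tensor's gradient.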
def sgd(params, lr):
    """stochastic gradient descent"""
    if isinstance(params, list):
        for i in params:
            sgd(i, lr)                      # recurse into nested parameter lists
    elif isinstance(params, Tensor):
        params.data -= params.grad * lr     # update the parameter
        params.grad = 0.0                   # reset its gradient

import numpy as np   # not really part of the framework; used later for the handwritten-digit dataset

def to_tensor(x):
"""将 list/numpy.ndarray 中的每个元素,转换成 Tensor"""
ans = [] if isinstance(x, list) or isinstance(x, np.ndarray): for i in range(len(x)):
ans.append(to_tensor(x[i])) else:
ans = Tensor(x) return ans# 准备数据X = [[1,2,3],[2,3,7],[5,3,1]]
Y = [[x_[0] + 2*x_[1] + 4*x_[2]] for x_ in X] # y = [[17, 36, 15]]X = to_tensor(X)
Y = to_tensor(Y)for i in range(len(X)): print(X[i],Y[i])[Tensor(data=1), Tensor(data=2), Tensor(data=3)] [Tensor(data=17)] [Tensor(data=2), Tensor(data=3), Tensor(data=7)] [Tensor(data=36)] [Tensor(data=5), Tensor(data=3), Tensor(data=1)] [Tensor(data=15)]
lr = 0.03              # learning rate
num_epochs = 3         # number of epochs
net = Linear(3, 1)     # build the network
loss = squared_loss    # choose the loss function

for epoch in range(num_epochs):
    for i in range(len(X)):
        l = loss(net(X[i]), Y[i])   # loss for X[i] and Y[i]
        l.backward()
        params = net.parameters()
        sgd(params, lr)
    train_l = 0.0
    for i in range(len(X)):
        train_l += loss(net(X[i]), Y[i]).data
    print(f'epoch{epoch + 1}, loss {float(train_l/len(X)):f}')

net(X[0]), Y[0]

epoch1, loss 5.200388
epoch2, loss 0.407041
epoch3, loss 0.041877
([Tensor(data=17.498340752751304)], [Tensor(data=17)])
An MLP is simply several Linear layers composed together.
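For example, MLP(3, [2, 1]) as used below stacks Linear(3, 2) → ReLU → Linear(2, 1); no activation is applied after the final layer.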
class MLP:
    def __init__(self, in_features, outs):
        self.linears = []
        self.num_linears = len(outs)
        # build one Linear layer per entry of outs; each layer's out_features
        # becomes the next layer's in_features
        for i in range(len(outs)):
            self.linears.append(Linear(in_features, outs[i]))
            in_features = outs[i]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        for i in range(self.num_linears - 1):
            x = self.linears[i](x)
            # ReLU activation after every hidden Linear layer
            # (relu() returns a new Tensor, so its result has to be kept)
            x = [j.relu() for j in x]
        x = self.linears[-1](x)
        return x

    def parameters(self):
        return [p for linear in self.linears for p in linear.parameters()]

# prepare the data
X = [[1,2,3],[2,3,7],[5,3,1]]
Y = [[x_[0] + 2*x_[1] + 4*x_[2]] for x_ in X]   # Y = [[17], [36], [15]]
X = to_tensor(X)
Y = to_tensor(Y)
for i in range(len(X)):
    print(X[i], Y[i])
[Tensor(data=1), Tensor(data=2), Tensor(data=3)] [Tensor(data=17)]
[Tensor(data=2), Tensor(data=3), Tensor(data=7)] [Tensor(data=36)]
[Tensor(data=5), Tensor(data=3), Tensor(data=1)] [Tensor(data=15)]
lr = 0.001
num_epochs = 10
net = MLP(3, [2,1])
loss = squared_loss

for epoch in range(num_epochs):
    for i in range(len(X)):
        l = loss(net(X[i]), Y[i])   # loss for X[i] and Y[i]
        l.backward()
        params = net.parameters()
        sgd(params, lr)
    train_l = 0.0
    for i in range(len(X)):
        train_l += loss(net(X[i]), Y[i]).data
    print(f'epoch{epoch + 1}, loss {float(train_l/len(X)):f}')

net(X[0]), Y[0]

epoch1, loss 184.394965
epoch2, loss 95.322663
epoch3, loss 30.901516
epoch4, loss 10.462304
epoch5, loss 5.934934
epoch6, loss 4.167137
epoch7, loss 3.059523
epoch8, loss 2.275400
epoch9, loss 1.705741
epoch10, loss 1.289367
([Tensor(data=17.56170670010426)], [Tensor(data=17)])
!mkdir -p /home/aistudio/work/mnist
!unzip /home/aistudio/data/data33695/mnist.zip -d /home/aistudio/work/mnist/

Archive:  /home/aistudio/data/data33695/mnist.zip
  inflating: /home/aistudio/work/mnist/t10k-images.idx3-ubyte
  inflating: /home/aistudio/work/mnist/t10k-labels.idx1-ubyte
  inflating: /home/aistudio/work/mnist/train-images.idx3-ubyte
  inflating: /home/aistudio/work/mnist/train-labels.idx1-ubyte
import sys
sys.path.append('/home/aistudio/work')
import load_MNIST
import matplotlib.pyplot as plt

def load_datasets(show_examples=False):
    X_train = load_MNIST.load_train_images()
    y_train = load_MNIST.load_train_labels()
    X_test = load_MNIST.load_test_images()
    y_test = load_MNIST.load_test_labels()
    if show_examples is True:
        sample = X_train[1, :, :]
        plt.imshow(sample)
        plt.show()
        print('matrix form of the sample:\n {}'.format(sample))
    return X_train, X_test, y_train, y_test

X_train_, X_test_, y_train_, y_test_ = load_datasets(show_examples=True)

Loading the MNIST handwritten-digit dataset:
Training images: 28*28, 60000/60000 loaded. Training labels: 60000 ... done.
Test images: 28*28, 10000/10000 loaded. Test labels: 10000 ... done.
<Figure size 640x480 with 1 Axes>
matrix form of the sample:
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 51. 159. 253. 159. 50. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
48. 238. 252. 252. 252. 237. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 54.
227. 253. 252. 239. 233. 252. 57. 6. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 10. 60. 224.
252. 253. 252. 202. 84. 252. 253. 122. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 163. 252. 252.
252. 253. 252. 252. 96. 189. 253. 167. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 51. 238. 253. 253.
190. 114. 253. 228. 47. 79. 255. 168. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 48. 238. 252. 252. 179.
12. 75. 121. 21. 0. 0. 253. 243. 50. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 38. 165. 253. 233. 208. 84.
0. 0. 0. 0. 0. 0. 253. 252. 165. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 7. 178. 252. 240. 71. 19. 28.
0. 0. 0. 0. 0. 0. 253. 252. 195. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 57. 252. 252. 63. 0. 0. 0.
0. 0. 0. 0. 0. 0. 253. 252. 195. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 198. 253. 190. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 255. 253. 196. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 76. 246. 252. 112. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 253. 252. 148. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 85. 252. 230. 25. 0. 0. 0. 0.
0. 0. 0. 0. 7. 135. 253. 186. 12. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 85. 252. 223. 0. 0. 0. 0. 0.
0. 0. 0. 7. 131. 252. 225. 71. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 85. 252. 145. 0. 0. 0. 0. 0.
0. 0. 48. 165. 252. 173. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 86. 253. 225. 0. 0. 0. 0. 0.
0. 114. 238. 253. 162. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 85. 252. 249. 146. 48. 29. 85. 178.
225. 253. 223. 167. 56. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 85. 252. 252. 252. 229. 215. 252. 252.
252. 196. 130. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 28. 199. 252. 252. 253. 252. 252. 233.
145. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 25. 128. 252. 253. 252. 141. 37.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

X_train_.shape, y_train_.shape, type(X_train_), X_test_.shape, y_test_.shape
((60000, 28, 28), (60000,), numpy.ndarray, (10000, 28, 28), (10000,))
# data preprocessing: flatten images and reshape labels
X_train = X_train_.reshape(60000, 28*28)
y_train = y_train_.reshape(60000, 1)
X_test = X_test_.reshape(10000, 28*28)
y_test = y_test_.reshape(10000, 1)
X_train[0], y_train[0]

# this framework is too crude and training is too slow, so only use part of the data
train_data = 1000
test_data = 10
X_train = to_tensor(X_train[0:train_data])
y_train = to_tensor(y_train[0:train_data])
X_test = to_tensor(X_test[0:test_data])
y_test = to_tensor(y_test[0:test_data])
This framework is still far too crude and leaves plenty of room for optimization; only part of the data is used for training here, as a demonstration.
lr = 0.00000001
num_epochs = 2
net = Linear(784, 10)
# net = MLP(28*28, [256,10])
loss = squared_loss

for epoch in range(num_epochs):
    for i in range(train_data):
        l = loss(net(X_train[i]), y_train[i])   # loss for X_train[i] and y_train[i]
        l.backward()
        params = net.parameters()
        sgd(params, lr)
        if i % 100 == 0:
            print(f'epoch{epoch + 1}, train loss {float(l.data):f}')
    test_l = 0.0
    for i in range(test_data):
        test_l += loss(net(X_test[i]), y_test[i]).data
    print(f'epoch{epoch + 1}, test loss {float(test_l/len(X_test)):f}')
# net(X_train[0]), y_train[0]

epoch1, train loss 93719477.784360
epoch1, train loss 83988.257223
epoch1, train loss 344.078848
epoch1, train loss 219084.742202
epoch1, train loss 1874046.072795
epoch1, train loss 11194.016174
epoch1, train loss 85165.438772
epoch1, train loss 6325.009070
epoch1, train loss 152919.084357
epoch1, train loss 55067.312630
epoch1, test loss 330436.891082
epoch2, train loss 260784.575575
epoch2, train loss 15026.502161
epoch2, train loss 81925.278769
epoch2, train loss 40676.119289
epoch2, train loss 1202164.376063
epoch2, train loss 23028.411272
epoch2, train loss 20343.179800
epoch2, train loss 18186.245439
epoch2, train loss 15613.012288
epoch2, train loss 15918.717633
epoch2, test loss 223290.439788
For comparison with Paddle: a rough training run using just one fully-connected layer.
import paddle
import paddle.nn as nn

# prepare the data
X_train_p = paddle.to_tensor(X_train_, stop_gradient=True).flatten(1).astype('float32')
X_test_p = paddle.to_tensor(X_test_, stop_gradient=True).flatten(1).astype('float32')
y_train_p = paddle.to_tensor(y_train_, stop_gradient=True).astype('float32')
y_test_p = paddle.to_tensor(y_test_, stop_gradient=True).astype('float32')

batch_size, lr, num_epochs = 256, 0.1, 10
loss = paddle.nn.loss.MSELoss()
net = nn.Linear(28*28, 1)   # a single fully-connected layer
sgd = paddle.optimizer.SGD(learning_rate=0.0000001, parameters=net.parameters())

for epoch in range(num_epochs):
    for i in range(60000):
        l = loss(net(X_train_p[i]), y_train_p[i])   # loss for X and y
        l.backward()
        sgd.step()
        sgd.clear_grad()
        if i % 30000 == 0:
            print(f'epoch{epoch + 1}, train loss {float(l.value()):f}')
    test_l = 0.0
    for i in range(1000):
        test_l += loss(net(X_test_p[i]), y_test_p[i]).value()
    print(f'epoch{epoch + 1}, test loss {float(test_l/10000):f}')

# rough training result
net(X_train_p[0]), y_train_p[0]

epoch1, train loss 3617.558350
epoch1, train loss 36.344646
epoch1, test loss 2.478513
epoch2, train loss 22.809593
epoch2, train loss 0.706598
epoch2, test loss 1.734603
epoch3, train loss 4.005621
epoch3, train loss 0.181211
epoch3, test loss 1.424313
epoch4, train loss 0.791444
epoch4, train loss 0.112583
epoch4, test loss 1.256442
epoch5, train loss 0.059686
epoch5, train loss 0.095287
epoch5, test loss 1.151413
epoch6, train loss 0.034223
epoch6, train loss 0.085789
epoch6, test loss 1.080396
epoch7, train loss 0.241043
epoch7, train loss 0.075794
epoch7, test loss 1.030066
epoch8, train loss 0.513496
epoch8, train loss 0.064292
epoch8, test loss 0.993136
epoch9, train loss 0.784249
epoch9, train loss 0.052085
epoch9, test loss 0.965241
epoch10, train loss 1.025587
epoch10, train loss 0.040221
epoch10, test loss 0.943633
(Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=False,
[3.89197564]),
Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
 [5.]))

That concludes the detailed walk-through of hand-writing a deep learning framework (part 1).