PyTorch distributed training

# -*- encoding: utf-8 -*-
"""
@File : DP.py
@Time : 2021/5/19 3:03 PM
@Author : Johnson
https://www.aiuai.cn/aifarm1764.html
"""
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# parameters and dataloaders
input_size = 5
output_size = 2
batch_size = 30
data_size = 100

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return self.len


rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)


class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        return output


model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    print(torch.cuda.device_count())
    model = nn.DataParallel(model)  # replicate the model across all visible GPUs
model.to(device)

# training loop
for data in rand_loader:
    input = data.to(device)
    output = model(input)
    print("outside: input size", input.size(), "output size", output.size())
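nn.DataParallel scatters each input batch along dimension 0 across the visible GPUs, runs a replica of the model on each slice, and gathers the outputs back onto the default device. A minimal sketch of how to observe this, where the print inside forward is an illustrative addition rather than part of the script above: with batch_size = 30 and 2 GPUs, each replica would typically see 15 samples, while the gathered output seen outside the model covers the full batch of 30.

class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        # each DataParallel replica only sees its slice of the batch,
        # e.g. torch.Size([15, 5]) when a batch of 30 is split across 2 GPUs
        print("\tIn Model: input size", input.size(), "output size", output.size())
        return output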


# -*- encoding: utf-8 -*-
"""
@File : DP.py
@Time : 2021/5/19 3:03 PM
@Author : Johnson
https://www.aiuai.cn/aifarm1764.html
https://mp.weixin.qq.com/s/N8jlsrDy1mho1HsNH5GBjA
"""

## Single machine, single GPU
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

BATCH_SIZE = 256
EPOCHS = 5

if __name__ == "__main__":
    # 1. define network
    device = "cuda"
    net = torchvision.models.resnet18(num_classes=10)
    net = net.to(device=device)

    # 2. define dataloader
    trainset = torchvision.datasets.CIFAR10(
        root="./data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                ),
            ]
        ),
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

    # 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )

    print(" ======= Training ======= \n")

    # 4. start to train
    net.train()
    for ep in range(1, EPOCHS + 1):
        train_loss = correct = total = 0
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()

            if (idx + 1) % 50 == 0 or (idx + 1) == len(train_loader):
                print(
                    " == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        ep,
                        EPOCHS,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )

    print("\n ======= Training Finished ======= \n")
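A short evaluation pass could follow the training loop above. The sketch below is an illustrative addition, not part of the original script; the testset / test_loader names are assumptions, and it reuses net, device, and BATCH_SIZE from the code above.

    testset = torchvision.datasets.CIFAR10(
        root="./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]
        ),
    )
    test_loader = torch.utils.data.DataLoader(
        testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True
    )

    net.eval()
    correct = total = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()
    print(" == test acc: {:6.3f}%".format(100.0 * correct / total))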
## Single machine, multiple GPUs (DataParallel)
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

BATCH_SIZE = 256
EPOCHS = 5

if __name__ == "__main__":
    # 1. define network
    device = "cuda"
    net = torchvision.models.resnet18(pretrained=False, num_classes=10)
    net = net.to(device=device)
    # Single-machine multi-GPU DataParallel speeds up training
    # with the minimum amount of code change.
    net = nn.DataParallel(net)

    # 2. define dataloader
    trainset = torchvision.datasets.CIFAR10(
        root="./data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                ),
            ]
        ),
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

    # 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )

    print(" ======= Training ======= \n")

    # 4. start to train
    net.train()
    for ep in range(1, EPOCHS + 1):
        train_loss = correct = total = 0
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()

            if (idx + 1) % 50 == 0 or (idx + 1) == len(train_loader):
                print(
                    " == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        ep,
                        EPOCHS,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )

    print("\n ======= Training Finished ======= \n")
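One practical detail with nn.DataParallel: the wrapper keeps the original network in its .module attribute, so checkpoints are usually saved from there to avoid a "module." prefix on every key in the state dict. A minimal sketch, with an illustrative file name:

    # save the underlying model rather than the DataParallel wrapper,
    # so the checkpoint can later be loaded into a plain (unwrapped) model
    state_dict = net.module.state_dict() if isinstance(net, nn.DataParallel) else net.state_dict()
    torch.save(state_dict, "./resnet18_cifar10.pth")  # file name is just an example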
## Multiple machines, multiple GPUs (DistributedDataParallel)
'''
Process group concepts:
group: the process group; in most cases all DDP processes belong to the same group.
world size: the total number of processes (ideally, one process per GPU).
rank: the index of the current process, used for inter-process communication; the process with rank = 0 is the master node.
local_rank: the index of the GPU the current process uses on its own machine.

Example: 4 machines with 8 GPUs each run a distributed training job. The process group is initialized with
init_process_group(); afterwards get_world_size() returns the world size, here 32, i.e. there are 32 processes
numbered 0-31, and get_rank() returns the number of the current process. On each machine the local rank ranges
over 0-7; this is the difference between local rank and rank, and the local rank maps to the actual GPU ID
(when running several jobs on one machine, use CUDA_VISIBLE_DEVICES to control which GPU devices each program
can see).
'''
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

BATCH_SIZE = 256
EPOCHS = 5

if __name__ == "__main__":
    # 0. set up distributed device
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(rank % torch.cuda.device_count())
    dist.init_process_group(backend="nccl")
    device = torch.device("cuda", local_rank)
    print(f"[init] == local rank: {local_rank}, global rank: {rank} ==")

    # 1. define network
    net = torchvision.models.resnet18(pretrained=False, num_classes=10)
    net = net.to(device)
    # DistributedDataParallel
    net = DDP(net, device_ids=[local_rank], output_device=local_rank)

    # 2. define dataloader
    trainset = torchvision.datasets.CIFAR10(
        root="./data",
        train=True,
        download=False,  # assumes the dataset has already been downloaded
        transform=transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                ),
            ]
        ),
    )
    # DistributedSampler: each process only sees its own shard of the data.
    # This test runs on a single machine with 2 GPUs; each process still loads
    # batches of BATCH_SIZE samples, so the effective global batch size is
    # 256 * 2 = 512, which is why the learning rate below is scaled by 2.
    train_sampler = DistributedSampler(
        trainset,
        shuffle=True,
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=BATCH_SIZE,
        num_workers=4,
        pin_memory=True,
        sampler=train_sampler,
    )

    # 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.01 * 2,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )

    if rank == 0:
        print(" ======= Training ======= \n")

    # 4. start to train
    net.train()
    for ep in range(1, EPOCHS + 1):
        train_loss = correct = total = 0
        # set the sampler epoch so that shuffling differs between epochs
        train_loader.sampler.set_epoch(ep)
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()

            if rank == 0 and ((idx + 1) % 25 == 0 or (idx + 1) == len(train_loader)):
                print(
                    " == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        ep,
                        EPOCHS,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )

    if rank == 0:
        print("\n ======= Training Finished ======= \n")
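The DDP script reads RANK and LOCAL_RANK from the environment, so it is meant to be started by a launcher that sets them, e.g. torchrun --nnodes=4 --nproc_per_node=8 ... train_ddp.py for the 4-machine, 8-GPU example in the docstring (the script name here is a placeholder). A minimal standalone sketch of how the quantities described in that docstring relate once the process group is initialized:

import os
import torch
import torch.distributed as dist

# assumes the process was started by torchrun (or another launcher) that sets
# MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE and LOCAL_RANK in the environment
dist.init_process_group(backend="nccl")
rank = dist.get_rank()                      # global process index, 0 .. world_size - 1
world_size = dist.get_world_size()          # total number of processes across all machines
local_rank = int(os.environ["LOCAL_RANK"])  # index of this process on its own machine
torch.cuda.set_device(local_rank)           # bind this process to "its" GPU
print(f"rank {rank}/{world_size}, local rank {local_rank}, device cuda:{local_rank}")
dist.destroy_process_group()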
