# comm_helpers.py
import collections
import logging
import math
import sys
import copy
import torch
import torch.distributed as dist
import functools


def flatten_tensors(tensors):
    """
    Reference: https://github.com/facebookresearch/stochastic_gradient_push

    Flatten dense tensors into a contiguous 1D buffer. Assumes tensors are of
    the same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.

    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.

    Returns:
        A 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].view(-1).clone()
    flat = torch.cat([t.view(-1) for t in tensors], dim=0)
    return flat
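
# A minimal, hypothetical illustration of flatten_tensors (the tensors below
# are examples, not part of this module): two dense tensors of the same dtype
# are concatenated into a single contiguous 1-D buffer.
#
#     a = torch.zeros(2, 3)            # 6 elements
#     b = torch.ones(4)                # 4 elements
#     flat = flatten_tensors([a, b])   # 1-D buffer with 10 elements
#     assert flat.shape == (10,)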


def unflatten_tensors(flat, tensors):
    """
    Reference: https://github.com/facebookresearch/stochastic_gradient_push

    View a flat buffer using the sizes of tensors. Assumes that the tensors
    are of the same dense type, and that flat is given by flatten_tensors.

    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.

    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)
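
# Sketch of the flatten/unflatten round trip that communicate() below relies
# on (hypothetical tensors, not part of this module): the views returned by
# unflatten_tensors read their values from the flat buffer, so element-wise
# work on the buffer is equivalent to working on each tensor individually.
#
#     a, b = torch.zeros(2, 3), torch.ones(4)
#     flat = flatten_tensors([a, b])
#     a2, b2 = unflatten_tensors(flat, [a, b])
#     assert torch.equal(a2, a) and torch.equal(b2, b)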


def communicate(tensors, communication_op, attention=False):
    """
    Reference: https://github.com/facebookresearch/stochastic_gradient_push

    Communicate a list of tensors.

    Arguments:
        tensors (Iterable[Tensor]): list of tensors.
        communication_op: a method or partial object which takes a tensor as
            input and communicates it. It can be a partial object around
            something like torch.distributed.all_reduce.
    """
    flat_tensor = flatten_tensors(tensors)
    communication_op(tensor=flat_tensor)
    if attention:
        # NOTE: the original line was `return tensors/flat_tensor`, which would
        # raise a TypeError (a Python list cannot be divided by a tensor).
        # Returning the unflattened, communicated values is the assumed intent.
        return unflatten_tensors(flat_tensor, tensors)
    for f, t in zip(unflatten_tensors(flat_tensor, tensors), tensors):
        t.set_(f)
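
# Hedged usage sketch for communicate(): averaging gradients across workers.
# `model` is a placeholder, the process group is assumed to be initialised
# elsewhere, and gradients are assumed to have been produced by a backward pass.
#
#     op = functools.partial(dist.all_reduce, op=dist.ReduceOp.SUM)
#     grads = [p.grad.data.div_(dist.get_world_size()) for p in model.parameters()]
#     communicate(grads, op)   # each gradient now holds the global average, in place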


def SyncAllreduce(model, rank, size):
    '''
    Synchronously average the model parameters across all workers.

    Inputs:
        model: (x^i) local neural net model at the i-th worker node
        rank:  (i) worker index
        size:  (m) total number of workers
    Output:
        returns nothing; the model parameters are changed in place
    Formula:
        x_new = sum_i x_i / size
    '''
    communication_op = functools.partial(dist.all_reduce)
    params_list = []
    # Pre-divide each parameter by the number of workers so that the
    # all-reduce (which sums across workers) produces the average.
    for param in model.parameters():
        param.data.div_(float(size))
        params_list.append(param.data)
    communicate(params_list, communication_op)
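

# Hedged example of driving SyncAllreduce from a training script. The backend,
# rendezvous settings, and model are placeholders; a real launcher (e.g.
# torchrun) would supply its own environment.
#
#     dist.init_process_group(backend="gloo", init_method="env://")
#     rank, size = dist.get_rank(), dist.get_world_size()
#     model = torch.nn.Linear(10, 1)
#     SyncAllreduce(model, rank, size)   # model now holds the average of all
#                                        # workers' parameters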