-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_MapReduce_CloudVenetoCluster.py
169 lines (150 loc) · 6.73 KB
/
2_MapReduce_CloudVenetoCluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
from collections import defaultdict
from pyspark import SparkContext, SparkConf
import os
import sys
import time
from statistics import median
import random
def CountTriangles(edges):
# Create a defaultdict to store the neighbors of each vertex
neighbors = defaultdict(set)
for edge in edges:
u, v = edge
neighbors[u].add(v)
neighbors[v].add(u)
# Initialize the triangle count to zero
triangle_count = 0
# Iterate over each vertex in the graph.
# To avoid duplicates, we count a triangle <u, v, w> only if u<v<w
for u in neighbors:
# Iterate over each pair of neighbors of u
for v in neighbors[u]:
if v > u:
for w in neighbors[v]:
# If w is also a neighbor of u, then we have a triangle
if w > v and w in neighbors[u]:
triangle_count += 1
# Return the total number of triangles in the graph
return triangle_count
def countTriangles2(colors_tuple, edges, rand_a, rand_b, p, num_colors):
# We assume colors_tuple to be already sorted by increasing colors. Just transform in a list for simplicity
colors = list(colors_tuple)
# Create a dictionary for adjacency list
neighbors = defaultdict(set)
# Create a dictionary for storing node colors
node_colors = dict()
for edge in edges:
u, v = edge
node_colors[u] = ((rand_a * u + rand_b) % p) % num_colors
node_colors[v] = ((rand_a * v + rand_b) % p) % num_colors
neighbors[u].add(v)
neighbors[v].add(u)
# Initialize the triangle count to zero
triangle_count = 0
# Iterate over each vertex in the graph
for v in neighbors:
# Iterate over each pair of neighbors of v
for u in neighbors[v]:
if u > v:
for w in neighbors[u]:
# If w is also a neighbor of v, then we have a triangle
if w > u and w in neighbors[v]:
# Sort colors by increasing values
triangle_colors = sorted((node_colors[u], node_colors[v], node_colors[w]))
# If triangle has the right colors, count it.
if colors == triangle_colors:
triangle_count += 1
# Return the total number of triangles in the graph
return triangle_count
def hash_function(u, params):
a, b, p, C = params
return ((a * u + b) % p) % C
def MR_ApproxTCwithNodeColors(rdd, C=1):
p = 8191
a = random.randint(1, p - 1)
b = random.randint(0, p - 1)
params = [a, b, p, C]
def mapEdgesUsingColor(edge):
hc_u = hash_function(edge[0], params)
hc_v = hash_function(edge[1], params)
if hc_u == hc_v:
return [(hc_u, edge)]
return []
return C ** 2 * rdd.flatMap(lambda x: mapEdgesUsingColor(x)) \
.groupByKey() \
.map(lambda x: CountTriangles(x[1])) \
.reduce(lambda x, y: x + y)
def MR_ExactTC(rdd, C=1):
p = 8191
a = random.randint(1, p - 1)
b = random.randint(0, p - 1)
params = [a, b, p, C]
return rdd.flatMap(lambda e: [(tuple(sorted((hash_function(e[0], params), hash_function(e[1], params), i))), e)
for i in range(C)]).groupByKey()\
.map(lambda x: countTriangles2(x[0], x[1], *params)) \
.reduce(lambda x, y: x + y)
def main():
# SPARK SETUP
conf = SparkConf().setAppName('TriangleCounting_CloudVeneto_HW2')
conf.set("spark.locality.wait", "0s")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
# INPUT READING
C = int(sys.argv[1])
R = int(sys.argv[2])
F = int(sys.argv[3])
file_path = sys.argv[4]
rawData = sc.textFile(file_path, minPartitions=C). \
map(lambda x: tuple(map(int, x.strip().split(",")))). \
repartition(numPartitions=32). \
cache()
file_name = os.path.basename(file_path)
num_edges = rawData.count()
num_executors = sc.getConf().get("spark.executor.instances")
if not F:
t_final_aprox_list = []
running_time_aprox_list = []
for _ in range(R):
start_time = time.time()
t_final = MR_ApproxTCwithNodeColors(rawData, C)
end_time = time.time()
t_final_aprox_list.append(t_final)
print(end_time, '-', start_time, '=', end_time - start_time)
running_time_aprox_list.append(end_time - start_time)
print(running_time_aprox_list)
runningTime_MR_ApproxTCwithNodeColors = int(1000 * (sum(running_time_aprox_list) / R))
median_MR_ApproxTCwithNodeColors = median(t_final_aprox_list)
print('OUTPUT with parameters: with {} executors, {} colors, {} repetitions, flag {}, file {}\n\n\
Dataset = {}\n\
Number of Edges = {}\n\
Number of Colors = {}\n\
Number of Repetitions = {}\n\
Approximation algorithm with node coloring\n\
- Number of triangles = {}\n\
- Running time (average over {} runs) = {} ms'.format(num_executors, C, R, F, file_name,
file_name, num_edges, C, R,
median_MR_ApproxTCwithNodeColors, R,
runningTime_MR_ApproxTCwithNodeColors))
else:
t_final_exact_list = []
running_time_exact_list = []
for _ in range(R):
start_time = time.time()
t_final = MR_ExactTC(rawData, C)
end_time = time.time()
t_final_exact_list.append(t_final)
running_time_exact_list.append(end_time - start_time)
runningTime_MR_ExactTC = int(1000 * (sum(running_time_exact_list) / R))
last_MR_ExactTC = t_final_exact_list[-1]
print('OUTPUT with parameters: with {} executors, {} colors, {} repetitions, flag {}, file {}\n\n\
Dataset = {}\n\
Number of Edges = {}\n\
Number of Colors = {}\n\
Number of Repetitions = {}\n\
Exact algorithm with node coloring\n\
- Number of triangles = {}\n\
- Running time (average over {} runs) = {} ms'.format(num_executors, C, R, F, file_name,
file_name, num_edges, C, R, last_MR_ExactTC, R,
runningTime_MR_ExactTC))
if __name__ == '__main__':
main()