@@ -59,8 +59,8 @@ def print_memory_usage():
59
59
# Case-sensitive column names let `Id` (Stack Overflow int) and `id` (GraphFrames ULID) coexist
60
60
.config ("spark.sql.caseSensitive" , True )
61
61
# Single node mode - 128GB machine
62
- .config ("spark.driver.memory" , "48g " )
63
- .config ("spark.executor.memory" , "48g " )
62
+ .config ("spark.driver.memory" , "16g " )
63
+ .config ("spark.executor.memory" , "8g " )
64
64
.getOrCreate ()
65
65
)
66
66
sc : SparkContext = spark .sparkContext
@@ -210,26 +210,26 @@ def add_missing_columns(df, all_cols):
210
210
#
211
211
# Create a [User]--Cast-->[Vote] edge
212
212
#
213
- user_voted_df : DataFrame = users_df .select (
214
- F .col ("id" ).alias ("src" ),
215
- F .col ("Id" ).alias ("UserId" ),
216
- # Everything has all the fields - should build from base records but need UUIDs
217
- F .col ("PostId" ).alias ("VotePostId" ),
218
- )
219
- user_voted_edge_df : DataFrame = (
220
- user_voted_df .join (votes_df , user_voted_df .UserId == votes_df .Id )
221
- .select (
222
- # 'src' comes from the votes' 'id'
223
- "src" ,
224
- # 'dst' comes from the posts' 'id'
225
- F .col ("id" ).alias ("dst" ),
226
- # All edges have a 'relationship' field
227
- F .lit ("Cast" ).alias ("relationship" ),
228
- )
229
- .cache ()
230
- )
231
- print (f"Total VotedFor edges: { voted_for_edge_df .count ():,} " )
232
- print (f"Percentage of linked votes: { voted_for_edge_df .count () / votes_df .count ():.2%} \n " )
213
+ # user_voted_df: DataFrame = users_df.select(
214
+ # F.col("id").alias("src"),
215
+ # F.col("Id").alias("UserId"),
216
+ # # Everything has all the fields - should build from base records but need UUIDs
217
+ # F.col("PostId").alias("VotePostId"),
218
+ # )
219
+ # user_voted_edge_df: DataFrame = (
220
+ # user_voted_df.join(votes_df, user_voted_df.UserId == votes_df.Id)
221
+ # .select(
222
+ # # 'src' comes from the users' 'id'
223
+ # "src",
224
+ # # 'dst' comes from the votes' 'id'
225
+ # F.col("id").alias("dst"),
226
+ # # All edges have a 'relationship' field
227
+ # F.lit("Cast").alias("relationship"),
228
+ # )
229
+ # .cache()
230
+ # )
231
+ # print(f"Total VotedFor edges: {voted_for_edge_df.count():,}")
232
+ # print(f"Percentage of linked votes: {voted_for_edge_df.count() / votes_df.count():.2%}\n")
233
233
234
234
235
235
#
0 commit comments