@@ -59,8 +59,8 @@ def print_memory_usage():
5959 # Lets the Id:(Stack Overflow int) and id:(GraphFrames ULID) coexist
6060 .config ("spark.sql.caseSensitive" , True )
6161 # Single node mode - 128GB machine
62- .config ("spark.driver.memory" , "48g " )
63- .config ("spark.executor.memory" , "48g " )
62+ .config ("spark.driver.memory" , "16g " )
63+ .config ("spark.executor.memory" , "8g " )
6464 .getOrCreate ()
6565)
6666sc : SparkContext = spark .sparkContext
@@ -210,26 +210,26 @@ def add_missing_columns(df, all_cols):
210210#
211211# Create a [User]--Cast-->[Vote] edge
212212#
213- user_voted_df : DataFrame = users_df .select (
214- F .col ("id" ).alias ("src" ),
215- F .col ("Id" ).alias ("UserId" ),
216- # Everything has all the fields - should build from base records but need UUIDs
217- F .col ("PostId" ).alias ("VotePostId" ),
218- )
219- user_voted_edge_df : DataFrame = (
220- user_voted_df .join (votes_df , user_voted_df .UserId == votes_df .Id )
221- .select (
222- # 'src' comes from the votes' 'id'
223- "src" ,
224- # 'dst' comes from the posts' 'id'
225- F .col ("id" ).alias ("dst" ),
226- # All edges have a 'relationship' field
227- F .lit ("Cast" ).alias ("relationship" ),
228- )
229- .cache ()
230- )
231- print (f"Total VotedFor edges: { voted_for_edge_df .count ():,} " )
232- print (f"Percentage of linked votes: { voted_for_edge_df .count () / votes_df .count ():.2%} \n " )
213+ # user_voted_df: DataFrame = users_df.select(
214+ # F.col("id").alias("src"),
215+ # F.col("Id").alias("UserId"),
216+ # # Everything has all the fields - should build from base records but need UUIDs
217+ # F.col("PostId").alias("VotePostId"),
218+ # )
219+ # user_voted_edge_df: DataFrame = (
220+ # user_voted_df.join(votes_df, user_voted_df.UserId == votes_df.Id)
221+ # .select(
222+ # # 'src' comes from the users' 'id'
223+ # "src",
224+ # # 'dst' comes from the votes' 'id'
225+ # F.col("id").alias("dst"),
226+ # # All edges have a 'relationship' field
227+ # F.lit("Cast").alias("relationship"),
228+ # )
229+ # .cache()
230+ # )
231+ # print(f"Total VotedFor edges: {voted_for_edge_df.count():,}")
232+ # print(f"Percentage of linked votes: {voted_for_edge_df.count() / votes_df.count():.2%}\n")
233233
234234
235235#
0 commit comments