-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_create_training_data_facebook_and_google.R
109 lines (90 loc) · 4.19 KB
/
01_create_training_data_facebook_and_google.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# This file creates a training dataset
# Notably, we only use ads with coded party_all
# IF all pd_ids by the respective page_id are the same party
library(data.table)
library(dplyr)
library(haven)
# Outputs
path_training_data <- "data/2020_fb_and_google_with_page_id_based_training_data.csv.gz"
#----
# Google
# Inputs
path_google_2020_vars <- "../datasets/google/google_2020_adid_var1.csv.gz"
path_wmp_ent <- "../datasets/wmp_entity_files/Google/2020/wmp_google_entities_v040521.dta"
# Ad id, pd_id, page_id
df <- fread(path_google_2020_vars, encoding = "UTF-8")
df <- df %>% select(ad_id, advertiser_id)
# Merge with WMP entities file using pd_id
ent <- read_dta(path_wmp_ent)
ent$party_all <- paste0(ent$hse_party, ent$sen_party, ent$pres_party, ent$cmte_party)
ent$party_all[ent$party_all == "DEMDEM"] <- "DEM"
ent$party_all[ent$party_all == "REPREP"] <- "REP"
ent$party_all[ent$party_all == ""] <- "NOTCODED"
ent$party_all[!ent$party_all %in% c("DEM", "REP", "NOTCODED")] <- "OTHER"
ent <- ent %>% select(advertiser_id, party_all)
df <- left_join(df, ent, by = "advertiser_id")
# Usable party_all
# Count how many different parties a page_id contains
# It's theoretically possible for two pd_ids of the same page_id to have different parties
# This could for example happen when someone runs ads on GQ, Vogue, etc.
test <- aggregate(df$party_all, by = list(df$advertiser_id), table)
test$usable_party_all <- unlist(lapply(test$x, length)) == 1
test <- test %>% select(-x)
names(test)[1] <- "advertiser_id"
df <- left_join(df, test, by = "advertiser_id")
df$party_all_usable <- df$party_all
df$party_all_usable[df$usable_party_all == F] <- NA
df$party_all_usable[df$party_all_usable == "NOTCODED"] <- NA
df_g <- df
df_g$platform <- "Google"
#----
# Inputs
# fb_2020_140m_adid_var1.csv.gz is an output from repo fb_2020
path_fb_140m_vars <- "../fb_2020/fb_2020_140m_adid_var1.csv.gz"
path_wmp_ent <- "../datasets/wmp_entity_files/Facebook/2020/wmp_fb_entities_v090622.csv"
# Ad id, pd_id, page_id
df <- fread(path_fb_140m_vars, encoding = "UTF-8")
df <- df %>% select(ad_id, pd_id, page_id)
# Merge with WMP entities file using pd_id
ent <- fread(path_wmp_ent)
ent <- ent %>% select(pd_id, party_all)
ent <- ent %>% filter(party_all != "MISSING")
df <- left_join(df, ent, by = "pd_id")
# Usable party_all
# First of all, check which don't have party coded
df$party_all[is.na(df$party_all)] <- "NOTCODED"
# Then count how many different parties a page_id contains
# It's theoretically possible for two pd_ids of the same page_id to have different parties
# This could for example happen when someone runs ads on GQ, Vogue, etc.
test <- aggregate(df$party_all, by = list(df$page_id), table)
test$usable_party_all <- unlist(lapply(test$x, length)) == 1
test <- test %>% select(-x)
names(test)[1] <- "page_id"
df <- left_join(df, test, by = "page_id")
df$party_all_usable <- df$party_all
df$party_all_usable[df$usable_party_all == F] <- NA
df$party_all_usable[df$party_all_usable == "NOTCODED"] <- NA
df_f <- df
df_f$platform <- "Facebook"
# Make advertiser_id for FB pd_id, then combine FB and Google
df_f$advertiser_id <- df_f$pd_id
df_f <- df_f %>% select(-c(pd_id, page_id))
df <- bind_rows(df_f, df_g)
#----
# Create the train-test split
# Usable advertisers
df <- as.data.frame(df)
df_usable <- df[(is.na(df$party_all_usable) == F) & (is.na(df$advertiser_id) == F),]
# Count how many ads there are per advertiser
df_advertiser_id_counts <- df_usable %>% group_by(advertiser_id) %>% count()
df_advertiser_id_counts <- df_usable %>% select(advertiser_id, party_all) %>% right_join(df_advertiser_id_counts, "advertiser_id")
# Create the test set
# Sample 10 D and R advertisers who have between 500 and 1000 ads
df_advertiser_id_counts_500_1000 <- df_advertiser_id_counts[(df_advertiser_id_counts$n > 500) & (df_advertiser_id_counts$n < 1000),]
set.seed(123)
test_Dem <- sample(df_advertiser_id_counts_500_1000$advertiser_id[df_advertiser_id_counts_500_1000$party_all == "DEM"], 10)
set.seed(123)
test_Rep <- sample(df_advertiser_id_counts_500_1000$advertiser_id[df_advertiser_id_counts_500_1000$party_all == "REP"], 10)
df_usable$split <- "train"
df_usable$split[df_usable$advertiser_id %in% c(test_Dem, test_Rep)] <- "test"
fwrite(df_usable, path_training_data)