forked from eaniket/rackathon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BERT.py
398 lines (347 loc) · 26.7 KB
/
BERT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
import numpy as np
import torch
import time
from pytorch_pretrained_bert import (GPT2LMHeadModel, GPT2Tokenizer,
BertTokenizer, BertForMaskedLM)
#from .class_register import register_api
class AbstractLanguageChecker:
    """
    Base class describing the backend API that GLTR talks to.

    A concrete backend subclasses this and overrides
    ``check_probabilities`` and ``postprocess``; both raise
    ``NotImplementedError`` here.
    """

    def __init__(self):
        """
        Select the torch device shared by all backends.

        ``self.device`` is the CUDA device when ``torch.cuda.is_available()``
        reports one, otherwise the CPU. Subclasses are expected to call this
        and then load whatever the other methods need (typically a tokenizer
        and a model).
        """
        device_name = "cpu"
        if torch.cuda.is_available():
            device_name = "cuda"
        self.device = torch.device(device_name)

    def check_probabilities(self, in_text, topk=40):
        """
        Entry point GLTR uses to score the words of a text.

        Params:
        - in_text: str -- the text to check
        - topk: int -- how many entries from the head of the distribution to keep

        Output:
        - payload: dict -- wrapper for the results, with these values:
            bpe_strings: list of str -- each individual token in the text
            real_topk: list of tuples -- (ranking, prob) of each token
            pred_topk: list of list of tuple -- (word, prob) for all topk
        """
        raise NotImplementedError

    def postprocess(self, token):
        """
        Clean a token of special characters and re-encode whitespace markers.

        A leading space is encoded as the character U+0120 and a linebreak
        as the character U+010A.

        :param token: str -- raw token text
        :return: str -- cleaned and re-encoded token text
        """
        raise NotImplementedError
def top_k_logits(logits, k):
    """
    Mask every logit that is not among the ``k`` largest of its row.

    Adapted from
    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

    :param logits: torch.Tensor -- scores; the last dimension ranges over the candidates
    :param k: int -- number of entries to keep per row; ``0`` disables the filter
    :return: torch.Tensor -- ``logits`` itself when ``k == 0``; otherwise a new
        tensor of the same shape in which every entry strictly smaller than
        the row's k-th largest value is replaced by ``-1e10``
    """
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    # Keep the trailing dimension (shape (..., 1)) so each row is compared
    # with its own threshold. A flat (batch,) vector only broadcasts
    # correctly when the batch size is 1.
    min_values = values[..., -1:]
    return torch.where(logits < min_values,
                       torch.ones_like(logits, dtype=logits.dtype) * -1e10,
                       logits)
#@register_api(name='gpt-2-small')
class LM(AbstractLanguageChecker):
    """
    GLTR backend that scores text with a GPT-2 language model.

    Holds a ``GPT2Tokenizer`` in ``self.enc`` and a ``GPT2LMHeadModel`` in
    ``self.model``; the model is moved to ``self.device`` and put in eval mode.
    """

    # Share of tokens that must be the model's own top-1 prediction for
    # check_probabilities to report the text as machine generated.
    MACHINE_TEXT_THRESHOLD = 0.50

    def __init__(self, model_name_or_path="gpt2"):
        """
        Load the tokenizer and the model.

        :param model_name_or_path: str -- forwarded to ``from_pretrained`` of
            both the tokenizer and the model
        """
        super(LM, self).__init__()
        self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
        self.model.to(self.device)
        self.model.eval()
        self.start_token = '<|endoftext|>'
        print("Loaded GPT-2 model!")

    def check_probabilities(self, in_text, topk=40):
        """
        Score every token of ``in_text`` under the model and print a verdict.

        The text is encoded, prefixed with the start token and passed through
        the model once. For each token after the start token the method
        records its rank and probability in the model's prediction for that
        position, plus the ``topk`` most probable candidates.

        Side effects: empties the CUDA cache when CUDA is available and, when
        at least one token was scored, prints the share of rank-0 tokens, the
        number of scored tokens and a verdict line.

        :param in_text: str -- the text to check
        :param topk: int -- number of candidates reported per position
        :return: dict with
            bpe_strings: list of str -- post-processed tokens, start token included
            real_topk: list of (rank, prob) -- one entry per token after the start token
            pred_topk: list of list of (word, prob) -- top candidates per position
        """
        # Process input
        start_t = torch.full((1, 1),
                             self.enc.encoder[self.start_token],
                             device=self.device,
                             dtype=torch.long)
        context = self.enc.encode(in_text)
        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)
        context = torch.cat([start_t, context], dim=1)

        # Inference only: skip building an autograd graph for the forward pass.
        with torch.no_grad():
            logits, _ = self.model(context)
            # Position i of the output predicts token i + 1 of the input.
            yhat = torch.softmax(logits[0, :-1], dim=-1)

        # Move the probabilities to the host once instead of once per position.
        probs = yhat.cpu().numpy()
        targets = context[0, 1:].tolist()

        # Candidate ids for each timestep, most probable first
        sorted_preds = np.argsort(-probs)

        # [(pos, prob), ...]
        real_topk_pos = [int(np.where(sorted_preds[i] == tok)[0][0])
                         for i, tok in enumerate(targets)]
        real_topk_probs = [round(float(probs[i, tok]), 5)
                           for i, tok in enumerate(targets)]
        real_topk = list(zip(real_topk_pos, real_topk_probs))

        # [str, str, ...]
        bpe_strings = [self.postprocess(self.enc.decoder[tok])
                       for tok in context[0].tolist()]

        # [[(word, prob), ...], [(word, prob), ..], ...]
        pred_topk = []
        for i in range(len(targets)):
            top_ids = sorted_preds[i][:topk]
            top_probs = probs[i][top_ids].tolist()
            pred_topk.append(
                [(self.postprocess(self.enc.decoder[p]), round(prob, 5))
                 for p, prob in zip(top_ids.tolist(), top_probs)])

        payload = {'bpe_strings': bpe_strings,
                   'real_topk': real_topk,
                   'pred_topk': pred_topk}
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Empty input scores no tokens; skip the report rather than divide by zero.
        total = len(real_topk)
        if total:
            count = sum(1 for rank, _ in real_topk if rank == 0)
            print(count / total)
            print(total)
            if count >= (self.MACHINE_TEXT_THRESHOLD * total):
                print("Yes, it is!!!")
            else:
                print("No, its human generated!!!")
        return payload

    def sample_unconditional(self, length=100, topk=5, temperature=1.0):
        """
        Sample ``length`` tokens from the model, starting from the start token.

        Code strongly inspired by
        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

        :param length: int -- number of tokens to draw
        :param topk: int -- sampling is restricted to this many candidates per step
        :param temperature: float -- the logits are divided by this value before the softmax
        :return: str -- decoded text of the start token followed by the sampled tokens
        """
        context = torch.full((1, 1),
                             self.enc.encoder[self.start_token],
                             device=self.device,
                             dtype=torch.long)
        prev = context
        output = context
        past = None
        # Forward through the model
        with torch.no_grad():
            for _ in range(length):
                # Only the newest token is fed in; `past` carries the model
                # state returned by the previous call.
                logits, past = self.model(prev, past=past)
                logits = logits[:, -1, :] / temperature
                # Filter predictions to topk and softmax
                probs = torch.softmax(top_k_logits(logits, k=topk),
                                      dim=-1)
                # Sample
                prev = torch.multinomial(probs, num_samples=1)
                # Construct output
                output = torch.cat((output, prev), dim=1)

        output_text = self.enc.decode(output[0].tolist())
        return output_text

    def postprocess(self, token):
        """
        Rewrite a raw vocabulary token into the form used in the payload.

        A leading 'Ġ' is removed and put back as U+0120 after the
        substitutions below; a token starting with 'Ċ' becomes a single space
        prefixed with U+010A; a token starting with 'â' becomes a single
        space. After that, a token starting with 'â', 'ľ', 'Ŀ' or 'Ļ' is
        replaced by '-', '“', '”' or "'" respectively.

        :param token: str -- raw token text
        :return: str -- cleaned and re-encoded token text
        """
        with_space = False
        with_break = False
        if token.startswith('Ġ'):
            with_space = True
            token = token[1:]
        elif token.startswith('â'):
            token = ' '
        elif token.startswith('Ċ'):
            token = ' '
            with_break = True

        # The 'â' test below can only match what is left after a stripped 'Ġ';
        # a bare leading 'â' was already turned into a space above.
        token = '-' if token.startswith('â') else token
        token = '“' if token.startswith('ľ') else token
        token = '”' if token.startswith('Ŀ') else token
        token = "'" if token.startswith('Ļ') else token

        if with_space:
            token = '\u0120' + token
        if with_break:
            token = '\u010A' + token
        return token
# #@register_api(name='BERT')
# class BERTLM(AbstractLanguageChecker):
# def __init__(self, model_name_or_path="bert-base-cased"):
# super(BERTLM, self).__init__()
# self.device = torch.device(
# "cuda" if torch.cuda.is_available() else "cpu")
# self.tokenizer = BertTokenizer.from_pretrained(
# model_name_or_path,
# do_lower_case=False)
# self.model = BertForMaskedLM.from_pretrained(
# model_name_or_path)
# self.model.to(self.device)
# self.model.eval()
# # BERT-specific symbols
# self.mask_tok = self.tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
# self.pad = self.tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
# print("Loaded BERT model!")
# def check_probabilities(self, in_text, topk=40, max_context=20,
# batch_size=20):
# '''
# Same behavior as GPT-2
# Extra param: max_context controls how many words should be
# fed in left and right
# Speeds up inference since BERT requires prediction word by word
# '''
# in_text = "[CLS] " + in_text + " [SEP]"
# tokenized_text = self.tokenizer.tokenize(in_text)
# # Construct target
# y_toks = self.tokenizer.convert_tokens_to_ids(tokenized_text)
# # Only use sentence A embedding here since we have non-separable seq's
# segments_ids = [0] * len(y_toks)
# y = torch.tensor([y_toks]).to(self.device)
# segments_tensor = torch.tensor([segments_ids]).to(self.device)
# # TODO batching...
# # Create batches of (x,y)
# input_batches = []
# target_batches = []
# for min_ix in range(0, len(y_toks), batch_size):
# max_ix = min(min_ix + batch_size, len(y_toks) - 1)
# cur_input_batch = []
# cur_target_batch = []
# # Construct each batch
# for running_ix in range(max_ix - min_ix):
# tokens_tensor = y.clone()
# mask_index = min_ix + running_ix
# tokens_tensor[0, mask_index + 1] = self.mask_tok
# # Reduce computational complexity by subsetting
# min_index = max(0, mask_index - max_context)
# max_index = min(tokens_tensor.shape[1] - 1,
# mask_index + max_context + 1)
# tokens_tensor = tokens_tensor[:, min_index:max_index]
# # Add padding
# needed_padding = max_context * 2 + 1 - tokens_tensor.shape[1]
# if min_index == 0 and max_index == y.shape[1] - 1:
# # Only when input is shorter than max_context
# left_needed = (max_context) - mask_index
# right_needed = needed_padding - left_needed
# p = torch.nn.ConstantPad1d((left_needed, right_needed),
# self.pad)
# tokens_tensor = p(tokens_tensor)
# elif min_index == 0:
# p = torch.nn.ConstantPad1d((needed_padding, 0), self.pad)
# tokens_tensor = p(tokens_tensor)
# elif max_index == y.shape[1] - 1:
# p = torch.nn.ConstantPad1d((0, needed_padding), self.pad)
# tokens_tensor = p(tokens_tensor)
# cur_input_batch.append(tokens_tensor)
# cur_target_batch.append(y[:, mask_index + 1])
# # new_segments = segments_tensor[:, min_index:max_index]
# cur_input_batch = torch.cat(cur_input_batch, dim=0)
# cur_target_batch = torch.cat(cur_target_batch, dim=0)
# input_batches.append(cur_input_batch)
# target_batches.append(cur_target_batch)
# real_topk = []
# pred_topk = []
# with torch.no_grad():
# for src, tgt in zip(input_batches, target_batches):
# # Compute one batch of inputs
# # By construction, MASK is always the middle
# logits = self.model(src, torch.zeros_like(src))[:,
# max_context + 1]
# yhat = torch.softmax(logits, dim=-1)
# sorted_preds = np.argsort(-yhat.data.cpu().numpy())
# # TODO: compare with batch of tgt
# # [(pos, prob), ...]
# real_topk_pos = list(
# [int(np.where(sorted_preds[i] == tgt[i].item())[0][0])
# for i in range(yhat.shape[0])])
# real_topk_probs = yhat[np.arange(
# 0, yhat.shape[0], 1), tgt].data.cpu().numpy().tolist()
# real_topk.extend(list(zip(real_topk_pos, real_topk_probs)))
# # # [[(pos, prob), ...], [(pos, prob), ..], ...]
# pred_topk.extend([list(zip(self.tokenizer.convert_ids_to_tokens(
# sorted_preds[i][:topk]),
# yhat[i][sorted_preds[i][
# :topk]].data.cpu().numpy().tolist()))
# for i in range(yhat.shape[0])])
# bpe_strings = [self.postprocess(s) for s in tokenized_text]
# pred_topk = [[(self.postprocess(t[0]), t[1]) for t in pred] for pred in pred_topk]
# payload = {'bpe_strings': bpe_strings,
# 'real_topk': real_topk,
# 'pred_topk': pred_topk}
# # print(len(real_topk))
# # count = 0
# # for i in range(0,len(real_topk)):
# # if real_topk[i][0] == 0:
# # count = count+1
# # print("count: "+str(count))
# # if count>= (0.65*len(real_topk)):
# # print("Yes, it is!!!")
# # else:
# # print("No, its human generated!!!")
# #print(real_topk[0])
# return payload
# def postprocess(self, token):
# with_space = True
# with_break = token == '[SEP]'
# if token.startswith('##'):
# with_space = False
# token = token[2:]
# if with_space:
# token = '\u0120' + token
# if with_break:
# token = '\u010A' + token
# #
# # # print ('....', token)
# return token
def main():
    """
    Run the GPT-2 checker on a hard-coded sample text and report its timing.

    Builds the default ``LM`` backend, passes ``raw_text`` to
    ``check_probabilities`` with ``topk=5`` (that call prints its own
    verdict) and prints the elapsed wall-clock time of the check.
    """
    # Alternative sample texts are kept below as comments; assign one of them
    # to raw_text to analyse it instead of the active sample.
    #
    # raw_text = """
    # In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.
    # The scientist named the population, after their distinctive horn, Ovid’s Unicorn. These four-horned, silver-white unicorns were previously unknown to science.
    # Now, after almost two centuries, the mystery of what sparked this odd phenomenon is finally solved.
    # Dr. Jorge Pérez, an evolutionary biologist from the University of La Paz, and several companions, were exploring the Andes Mountains when they found a small valley, with no other animals or humans. Pérez noticed that the valley had what appeared to be a natural fountain, surrounded by two peaks of rock and silver snow.
    # Pérez and the others then ventured further into the valley. “By the time we reached the top of one peak, the water looked blue, with some crystals on top,” said Pérez.
    # Pérez and his friends were astonished to see the unicorn herd. These creatures could be seen from the air without having to move too much to see them – they were so close they could touch their horns.
    # While examining these bizarre creatures the scientists discovered that the creatures also spoke some fairly regular English. Pérez stated, “We can see, for example, that they have a common ‘language,’ something like a dialect or dialectic.”
    # Dr. Pérez believes that the unicorns may have originated in Argentina, where the animals were believed to be descendants of a lost race of people who lived there before the arrival of humans in those parts of South America.
    # While their origins are still unclear, some believe that perhaps the creatures were created when a human and a unicorn met each other in a time before human civilization. According to Pérez, “In South America, such incidents seem to be quite common.”
    # However, Pérez also pointed out that it is likely that the only way of knowing for sure if unicorns are indeed the descendants of a lost alien race is through DNA. “But they seem to be able to communicate in English quite well, which I believe is a sign of evolution, or at least a change in social organization,” said the scientist.
    # """
    #Machine text
    #raw_text = "The following is a transcript from The Guardian's interview with the British ambassador to the UN, John Baird. Baird: The situation in Syria is very dire. We have a number of reports of chemical weapons being used in the country. The Syrian opposition has expressed their willingness to use chemical weapons. We have a number of people who have been killed, many of them civilians. I think it is important to understand this. There are many who are saying that the chemical weapons used in Syria are not only used to destroy people but also to destroy the Syrian people. The Syrian people have been suffering for many years. The regime is responsible for that suffering. They have been using chemical weapons. They have killed many people, and they continue to kill many more. I think that the international community has to take a position that the Assad regime has a responsibility for that suffering. It must take a stand that we are not going to allow the Syrian government to use chemical weapons on civilians, that we are not going to allow them, and that we do not condone their use. We have a lot of people who believe that the regime is responsible for this suffering, and that they are responsible for this suffering, and that they are responsible for the use of chemical weapons. I think that we need to be clear about that. We must be clear that the use of chemical weapons by any country, including Russia and Iran, is a violation of international law. We are not going to tolerate that. We do not tolerate that. And we have the responsibility to ensure that the world doesn't allow the Assad regime to use chemical weapons against civilians."
    #raw_text = "The following is a transcript from The Guardian's interview with the British ambassador to the UN, John Baird.Baird: The situation in Syria is very dire. We have a number of reports of chemical weapons being used in the country. The Syrian opposition has expressed their willingness to use chemical weapons. We have a number of people who have been killed, many of them civilians. I think it is important to understand this.There are many who are saying that the chemical weapons used in Syria are not only used to destroy people but also to destroy the Syrian people. The Syrian people have been suffering for many years. The regime is responsible for that suffering. They have been using chemical weapons. They have killed many people, and they continue to kill many more.I think that the international community has to take a position that the Assad regime has a responsibility for that suffering. It must take a stand that we are not going to allow the Syrian government to use chemical weapons on civilians, that we are not going to allow them, and that we do not condone their use.We have a lot of people who believe that the regime is responsible for this suffering, and that they are responsible for this suffering, and that they are responsible for the use of chemical weapons. I think that we need to be clear about that.We must be clear that the use of chemical weapons by any country, including Russia and Iran, is a violation of international law. We are not going to tolerate that. We do not tolerate that. And we have the responsibility to ensure that the world doesn't allow the Assad regime to use chemical weapons against civilians.Baird: It seems that there are a range of people that are saying that we are not allowed to use chemical weapons in Syria. There are many who say we are not allowed to use chemical weapons in Syria.I think there are a lot of people that are saying that we are not allowed to use chemical weapons in Syria. I think that we have to take a stand that we are not going to allow the Assad regime to use chemical weapons on civilians, that we are not going to tolerate that. We have to take a stand that we are not going to allow Russia and Iran to use chemical weapons on civilians.Baird: I think it is important for us to understand that the use of chemical weapons in Syria is an extremely dangerous situation. I think there has been very little information from the UN that the regime has used any chemical weapons. We have not seen any evidence that they are using them.We have to understand that the use of chemical weapons is very dangerous."
    #Human text
    #raw_text = "In this work, we study the internal representations of GANs. To a human observer, a well-trained GAN appears to have learned facts about the objects in the image: for example, a door can appear on a building but not on a tree. We wish to understand how a GAN represents such structure. Do the objects emerge as pure pixel patterns without any explicit representation of objects such as doors and trees, or does the GAN contain internal variables that correspond to the objects that humans perceive? If the GAN does contain variables for doors and trees, do those variables cause the generation of those objects, or do they merely correlate? How are relationships between objects represented? By carefully examining representation units, we have found that many parts of GAN representations can be interpreted, not only as signals that correlate with object concepts but as variables that have a causal effect on the synthesis of objects in the output. These interpretable effects can be used to compare, debug, modify, and reason about a GAN model. Our method can be potentially applied to other generative models such as VAEs and RealNVP."
    #raw_text = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."
    #raw_text="The Dutch Central Bank Announces Digital Technologies Will Be A Higher Priority For Its Supervisory Approach President Klaas Knot of the central bank of the Netherlands (De Nederlandsche Bank, DNB) speaks . during the presentation for the annual report of the DNB in Amsterdam, on March . AFP PHOTO ANP Koen van Weel Netherlands OUT (Photo credit should read KOEN VAN WEELAFP via Getty Images)On January 22nd, the Dutch Central Bank (DNB) announced that data and the use of digital technologies would be a supervisory focus for 2020. The DNB has been outspoken among national financial institutions in its support for settlement and payment system technological innovations, which includes blockchain and crypto forums. The DNB has also published its annual Supervision Outlook for 2020, which explains in detail their priorities. A key highlight, from the 2020 report, for the blockchain sector, is that the DNB will start using AMLD5 (the new E.U. antimoney laundering directive) to monitor crypto enterprises.The supervisory guidance explained in the DNB report, “stresses that combating financial and economic crime continues to play a key role in our supervision. Integrity is a crucial precondition for trust in the financial sector, and the eyes of society are on our efforts to combat money laundering.” The AMLD5 rule took effect on January 10th, 2020. Know-your-customer (KYC) compliance costs are expected to increase in the industry significantly. All virtual currency and blockchain style firms operating in the Netherlands must register with DNB. Complicating matters for E.U. based crypto firms could be the impact of Brexit. Klaas Knot, President of the DNB, in opening remarks at the SUERF conference on January 8th, said that “Central banks and supervisors on both sides of the Channel will continue to coordinate our efforts in the IMF, the FSB, the BIS and other standard-setting bodies.” While the SUERF conference focused on the economic relationship between the U.K. and the E.U., cooperation will need to extend to crypto regulatory standards, like KYC, to foster compliance across jurisdictions.Potentially the two top challenges for European crypto firms implementing AMLD5 KYC changes are: the identification of holders’ bank accounts and crypto wallets, and the expansion of the virtual currency provider list.The DNB also said, in a separate bulletin on January 22nd, that “In this highly dynamic environment, institutions must give first priority to the security, governance, optimum use and quality of data.” The DNB emphasized that trust in the marketplace depends on the careful use of personal data. The DNB also maintains, while crypto coins, like Bitcoin, carry investment risk and are not backed by central banks, they recognize the opportunities for blockchain and distributed ledger technology to contribute to cheaper and more efficient cross border payments.I report on public adoption of cryptocurrency, collateralized tokens and stable coins by banks and enterprises. My coverage includes blockchain and distributed ledger"

    # Active sample
    raw_text = "Screenwriter Ryan Murphy, who has produced the FX series American Crime Story, is set to bring the Monica Clinton White House sex saga to TV. According to the Hollywood Reporter, the Ryan Murphy Productions chief has optioned author and CNN legal analyst Jeffrey Tobins 2000 book A Vast Conspiracy: The Real Sex Scandal That Nearly Brought Down a President. The New York Times bestseller, acquired by Fox 21 Television Studios and FX Productions, will become the basis for a future American Crime Story season. In February, Murphy told E! News that the series would explore the Lewinsky sex scandal as plot to tear down President Bill Clinton, and on the other women who were ensnared in the 1996 sex scandal, involving House intern Monica Lewinsky, and the events that led to Clintons impeachment. Its not really about Hillary Clinton. That book is about the rise of a certain segment of a group of people who despised the Clintons and used three women, Paula Jones, Monica Lewinsky and Linda Tripp to try and tear him down, Murphy said. In February, Murphy announced that actress Sarah Paulson who starred in the first season of his crime drama, The People vs. O. J. Simpson has been confirmed for a role, but ruled out that it would be of Lady Hillary Clinton. The mogul has reportedly confirmed that his studio is looking actresses to portray Lewinsky and Tripp. Season two of the Golden Globe and Primetime Emmy show will tackle Hurricane Katrina, and is set to premier in 2018. Season three, he confirmed will focus on the 1997 assassination of Italian fashion designer Gianni Versace, singer Ricky Martin has already joined the cast. Follow Jerome Hudson on Twitter: @JeromeEHudson"

    # Tests for BERT (disabled; the BERTLM class is commented out in this file)
    # lm = BERTLM()
    # start = time.time()
    # count = lm.check_probabilities(raw_text, topk=5)
    # end = time.time()
    # print("{:.2f} Seconds for a run with BERT".format(end - start))

    # Tests for GPT-2
    lm = LM()
    start = time.time()
    # The payload is not inspected here; check_probabilities prints the verdict.
    lm.check_probabilities(raw_text, topk=5)
    end = time.time()
    print("{:.2f} Seconds for a check with GPT-2".format(end - start))
    # An unconditional sample used to be generated here, but neither the text
    # nor its timing was ever printed, so the 100 extra model calls were
    # dropped. lm.sample_unconditional() is still available if needed.
# Run the GPT-2 check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()