@@ -23,7 +23,6 @@
 import tensorflow.compat.v2 as tf
 import tensorflow_datasets.public_api as tfds
 
-
 _DESCRIPTION = """\
 CNN/DailyMail non-anonymized summarization dataset.
 
@@ -63,11 +62,16 @@
 
 _DL_URLS = {
     # pylint: disable=line-too-long
-    'cnn_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
-    'dm_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
-    'test_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
-    'train_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
-    'val_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
+    'cnn_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
+    'dm_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
+    'test_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
+    'train_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
+    'val_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
     # pylint: enable=line-too-long
 }
 
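For orientation: `_DL_URLS` is the dict that `_split_generators` (further down in this diff) hands to the TFDS download manager. A minimal sketch of that hand-off, assuming only the standard `DownloadManager.download_and_extract` API; `_split_generators_sketch` is a hypothetical name, not part of this file:

```python
# Sketch: download_and_extract maps each key in _DL_URLS to a local path,
# downloading and, where applicable, extracting archives.
def _split_generators_sketch(dl_manager):
  dl_paths = dl_manager.download_and_extract(_DL_URLS)
  # dl_paths['cnn_stories'] -> extracted CNN stories directory
  # dl_paths['val_urls']    -> local copy of all_val.txt
  return dl_paths
```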
@@ -77,12 +81,14 @@
     tfds.core.Version('0.0.2', experiments={tfds.core.Experiment.S3: False}),
     # Same data as 0.0.2
     tfds.core.Version('1.0.0',
-                      'New split API (https://tensorflow.org/datasets/splits)')]
+                      'New split API (https://tensorflow.org/datasets/splits)'),
+    # Having the model predict newline separators makes it easier to evaluate
+    # using summary-level ROUGE.
+    tfds.core.Version('2.0.0', 'Separate target sentences with newline.')
+]
 
-# Having the model predict newline separators makes it easier to evaluate
-# using summary-level ROUGE.
-_DEFAULT_VERSION = tfds.core.Version('2.0.0',
-                                     'Separate target sentences with newline.')
+# Using cased version.
+_DEFAULT_VERSION = tfds.core.Version('3.0.0', 'Using cased version.')
 
 
 class CnnDailymailConfig(tfds.core.BuilderConfig):
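With 3.0.0 as the new default, the earlier variants stay reachable by pinning an explicit version in `tfds.load`. A brief usage sketch; whether an older version can actually be built or downloaded depends on what is generated and hosted:

```python
import tensorflow_datasets as tfds

# Default build after this change: cased text (3.0.0).
ds = tfds.load('cnn_dailymail', split='train')

# Pin the earlier lowercased variant explicitly, if its data is available.
ds_v2 = tfds.load('cnn_dailymail:2.0.0', split='train')
```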
@@ -109,6 +115,7 @@ def __init__(self, text_encoder_config=None, **kwargs):
 def _get_url_hashes(path):
   """Get hashes of urls in file."""
   urls = _read_text_file(path)
+
   def url_hash(u):
     h = hashlib.sha1()
     try:
@@ -117,6 +124,7 @@ def url_hash(u):
       logging.error('Cannot hash url: %s', u)
     h.update(u)
     return h.hexdigest()
+
   return {url_hash(u): True for u in urls}
 
 
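Split membership is decided by the SHA-1 of each story URL. A self-contained illustration of the same hashing, assuming the lines elided by the hunk above encode the URL to UTF-8 inside the try block, as the surrounding try/except suggests; the example URL is made up:

```python
import hashlib

u = 'https://www.cnn.com/some/story.html'  # hypothetical URL
h = hashlib.sha1()
h.update(u.encode('utf-8'))  # assumed encoding step before hashing
print(h.hexdigest())  # stable id used to route the story into a split
```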
@@ -158,8 +166,10 @@ def _subset_filenames(dl_paths, split):
 DM_SINGLE_CLOSE_QUOTE = u'\u2019'  # unicode
 DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
 # acceptable ways to end a sentence
-END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
-              DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
+END_TOKENS = [
+    '.', '!', '?', '...', "'", '`', '"', DM_SINGLE_CLOSE_QUOTE,
+    DM_DOUBLE_CLOSE_QUOTE, ')'
+]
 
 
 def _read_text_file(text_file):
@@ -177,19 +187,22 @@ def _get_art_abs(story_file, tfds_version):
 
   lines = _read_text_file(story_file)
 
-  # Lowercase everything
-  lines = [line.lower() for line in lines]
+  # The GitHub code lowercases the text; we removed that in 3.0.0.
 
   # Put periods on the ends of lines that are missing them
   # (this is a problem in the dataset because many image captions don't end in
   # periods; consequently they end up in the body of the article as run-on
   # sentences)
   def fix_missing_period(line):
     """Adds a period to a line that is missing a period."""
-    if '@highlight' in line: return line
-    if not line: return line
-    if line[-1] in END_TOKENS: return line
+    if '@highlight' in line:
+      return line
+    if not line:
+      return line
+    if line[-1] in END_TOKENS:
+      return line
     return line + ' .'
+
   lines = [fix_missing_period(line) for line in lines]
 
   # Separate out article and abstract sentences
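To make the caption fix concrete, here is a self-contained copy of the logic above (the real function is nested inside `_get_art_abs`, so this standalone version is for illustration only):

```python
# END_TOKENS as defined earlier in the file.
END_TOKENS = ['.', '!', '?', '...', "'", '`', '"', u'\u2019', u'\u201d', ')']

def fix_missing_period(line):
  if '@highlight' in line or not line or line[-1] in END_TOKENS:
    return line
  return line + ' .'

print(fix_missing_period('a run-on image caption'))  # -> 'a run-on image caption .'
print(fix_missing_period('A full sentence.'))        # unchanged
print(fix_missing_period('@highlight'))              # marker lines unchanged
```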
@@ -247,10 +260,12 @@ def _info(self):
         builder=self,
         description=_DESCRIPTION,
         features=tfds.features.FeaturesDict({
-            _ARTICLE: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
-            _HIGHLIGHTS: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
+            _ARTICLE:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+            _HIGHLIGHTS:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
         }),
         supervised_keys=(_ARTICLE, _HIGHLIGHTS),
         homepage='https://github.com/abisee/cnn-dailymail',
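Since `supervised_keys=(_ARTICLE, _HIGHLIGHTS)`, the builder supports `as_supervised` loading. A short sketch of consuming (article, highlights) pairs:

```python
import tensorflow_datasets as tfds

# as_supervised=True yields (article, highlights) text tensors per example.
ds = tfds.load('cnn_dailymail', split='test', as_supervised=True)
for article, highlights in ds.take(1):
  print(article.numpy()[:80])     # first bytes of the article
  print(highlights.numpy()[:80])  # newline-separated highlight sentences
```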
@@ -278,17 +293,16 @@ def _split_generators(self, dl_manager):
             name=tfds.Split.TRAIN,
             num_shards=100,
             gen_kwargs={'files': train_files}),
-
         tfds.core.SplitGenerator(
             name=tfds.Split.VALIDATION,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.VALIDATION)}),
+            gen_kwargs={
+                'files': _subset_filenames(dl_paths, tfds.Split.VALIDATION)
+            }),
         tfds.core.SplitGenerator(
             name=tfds.Split.TEST,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.TEST)})
+            gen_kwargs={'files': _subset_filenames(dl_paths, tfds.Split.TEST)})
     ]
 
   def _generate_examples(self, files):
@@ -297,7 +311,4 @@ def _generate_examples(self, files):
       if not article or not highlights:
         continue
       fname = os.path.basename(p)
-      yield fname, {
-          _ARTICLE: article,
-          _HIGHLIGHTS: highlights
-      }
+      yield fname, {_ARTICLE: article, _HIGHLIGHTS: highlights}
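The 2.0.0 note above is the payoff here: because `_HIGHLIGHTS` keeps newline separators, summary-level ROUGE needs no sentence splitter. A hedged sketch, assuming Google's `rouge-score` package (an external dependency, not used by this file); the strings are illustrative:

```python
from rouge_score import rouge_scorer  # assumption: pip install rouge-score

# rougeLsum treats '\n' as a sentence boundary, matching the dataset's
# newline-separated highlights.
scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)
reference = 'first highlight .\nsecond highlight .'
prediction = 'first highlight .\na different sentence .'
print(scorer.score(reference, prediction)['rougeLsum'].fmeasure)
```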