Skip to content

Commit

Permalink
Merge pull request #585 from PrimozGodec/fix-corpus-from-table
Browse files Browse the repository at this point in the history
[FIX] Corpus - from_table: keep text feature when renamed
  • Loading branch information
ajdapretnar authored Oct 9, 2020
2 parents 38ac5df + 2060c7b commit 63bb2f2
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
18 changes: 16 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,7 @@ def retain_preprocessing(orig, new, key=...):
if isinstance(orig, Corpus):
if isinstance(key, tuple): # get row selection
key = key[0]

if orig._tokens is not None: # retain preprocessing
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
Expand All @@ -606,9 +607,22 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary

if isinstance(new, Corpus):
# _find_identical_feature returns None when the feature is not found;
# filter these Nones out of the list
new.text_features = list(filter(None, [
new._find_identical_feature(tf)
for tf in orig.text_features
]))
else:
new.text_features = [
tf
for tf in orig.text_features
if tf in set(new.domain.metas)
]

new._titles = orig._titles[key]
new_domain_metas = set(new.domain.metas)
new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
new.attributes = orig.attributes
new.used_preprocessor = orig.used_preprocessor
Expand Down
12 changes: 12 additions & 0 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,18 @@ def test_from_table(self):
np.testing.assert_equal(t.metas, c.metas)
self.assertEqual(c.text_features, [t.domain.metas[0]])

def test_from_table_renamed(self):
    """Check Corpus.from_table when the first meta variable is renamed.

    The target domain keeps the source attributes and carries a single
    meta: the source's first meta renamed to "text1". The result must be
    a Corpus of the same length with equal metas, whose only text
    feature is named "text1".
    """
    source = Corpus.from_file('book-excerpts')
    renamed_meta = source.domain.metas[0].renamed("text1")
    new_domain = Domain(source.domain.attributes, metas=[renamed_meta])

    # when text feature renamed
    converted = Corpus.from_table(new_domain, source)
    self.assertIsInstance(converted, Corpus)
    self.assertEqual(len(source), len(converted))
    np.testing.assert_equal(source.metas, converted.metas)

    # exactly one text feature must survive, under the new name
    text_features = converted.text_features
    self.assertEqual(1, len(text_features))
    self.assertEqual("text1", text_features[0].name)

def test_infer_text_features(self):
c = Corpus.from_file('friends-transcripts')
tf = c.text_features
Expand Down

0 comments on commit 63bb2f2

Please sign in to comment.