Skip to content

Commit 434e00b

Browse files
committed
Fighting conversion errors in PDTSC.
1 parent c315d68 commit 434e00b

File tree

3 files changed

+81
-1
lines changed

3 files changed

+81
-1
lines changed

lib/Treex/Block/HamleDT/CS/FixUD.pm

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,6 +1293,7 @@ BEGIN
12931293
['na téma', 'fixed', 'na téma', 'ADP NOUN', 'RR--4---------- NNNS4-----A----', 'pos=adp|adpostype=prep|case=acc|extpos=adp pos=noun|nountype=com|gender=neut|number=sing|case=acc', '0:case 1:fixed'],
12941294
['na způsob', 'fixed', 'na způsob', 'ADP NOUN', 'RR--4---------- NNIS4-----A----', 'pos=adp|adpostype=prep|case=acc|extpos=adp pos=noun|nountype=com|gender=masc|animacy=inan|number=sing|case=acc', '0:case 1:fixed'],
12951295
['s ohledem na', 'always', 's ohled na', 'ADP NOUN ADP', 'RR--7---------- NNIS7-----A---- RR--4----------', 'pos=adp|adpostype=prep|case=ins|extpos=adp pos=noun|nountype=com|gender=masc|animacy=inan|number=sing|case=ins pos=adp|adpostype=prep|case=acc', '0:case 1:fixed 1:fixed'],
1296+
['s výjimkou', 'fixed', 's výjimka', 'ADP NOUN', 'RR--7---------- NNFS7-----A----', 'pos=adp|adpostype=prep|case=ins|extpos=adp pos=noun|nountype=com|gender=fem|number=sing|case=ins', '0:case 1:fixed'],
12961297
['v době', 'fixed', 'v doba', 'ADP NOUN', 'RR--6---------- NNFS6-----A----', 'pos=adp|adpostype=prep|case=loc|extpos=adp pos=noun|nountype=com|gender=fem|number=sing|case=loc', '0:case 1:fixed'],
12971298
['v případě', 'fixed', 'v případ', 'ADP NOUN', 'RR--6---------- NNIS6-----A----', 'pos=adp|adpostype=prep|case=loc|extpos=adp pos=noun|nountype=com|gender=masc|animacy=inan|number=sing|case=loc', '0:case 1:fixed'],
12981299
['z hlediska', 'fixed', 'z hledisko', 'ADP NOUN', 'RR--2---------- NNNS2-----A----', 'pos=adp|adpostype=prep|case=gen|extpos=adp pos=noun|nountype=com|gender=neut|number=sing|case=gen', '0:case 1:fixed'],
@@ -2750,6 +2751,13 @@ sub fix_annotation_errors
27502751
my @subtree = $self->get_node_subtree($node);
27512752
$subtree[4]->set_deprel('parataxis');
27522753
}
2754+
# PDT-C 2.0 dev tamw pdtsc_046_1.02 # 19
2755+
elsif($spanstring =~ m/^, de facto ne se všemi ,$/)
2756+
{
2757+
my @subtree = $self->get_node_subtree($node);
2758+
$subtree[4]->set_parent($subtree[5]);
2759+
$subtree[4]->set_deprel('case');
2760+
}
27532761
# Make sure that no node has more than one subject. This is to prevent
27542762
# validation errors in UD. However, instead of randomly picking a subject
27552763
# and re-labeling it as dep, we should investigate and fix the error

lib/Treex/Block/HamleDT/CS/HarmonizePDTC.pm

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ sub process_zone
2424
my $zone = shift;
2525
my $root = $self->SUPER::process_zone($zone);
2626
$self->revert_multiword_preps_to_auxp($root);
27+
$self->prevent_compound_subordinators($root);
2728
return $root;
2829
}
2930

@@ -209,6 +210,57 @@ sub revert_multiword_preps_to_auxp
209210

210211

211212

213+
#------------------------------------------------------------------------------
# Blocks the "compound subordinator" reading for selected word pairs.
#
# When an AuxY node hangs on an AuxC node, the pair may be a genuine multiword
# subordinator, or merely two subordinators that happen to occur in the same
# subordinate clause. The later conversion to UD tends to pick the compound
# reading. For the word pairs listed below we know that reading is wrong, so
# the AuxY node is moved from the AuxC node to one of its siblings, i.e., to
# the argument of the main subordinator (the predicate of the subordinate
# clause).
#
# Only the word-pair list is consulted here; adjacency of the two words is not
# checked by this method.
#------------------------------------------------------------------------------
BEGIN
{
    # Unwanted word pairs, each given as [parent form, child form], lowercased.
    # The list is constant; it is filled at compile time and stored in a
    # package variable so that it is available globally.
    @Treex::Block::HamleDT::CS::HarmonizePDTC::non_compound_subordinators =
    (
        ['jako', 'čili'],
        ['že', 'že']
    );
}
sub prevent_compound_subordinators
{
    my $self = shift;
    my $root = shift;
    foreach my $node ($root->get_descendants({'ordered' => 1}))
    {
        # We are only interested in AuxY nodes attached directly to an AuxC node.
        next unless($node->deprel() eq 'AuxY');
        my $parent = $node->parent();
        next unless($parent->deprel() eq 'AuxC');
        # Is this (parent, child) pair on the list of pairs that must not be
        # interpreted as a compound subordinator? Forms are compared lowercased.
        my $lcpf = lc($parent->form());
        my $lccf = lc($node->form());
        my $unwanted = any {$lcpf eq $_->[0] && $lccf eq $_->[1]} (@Treex::Block::HamleDT::CS::HarmonizePDTC::non_compound_subordinators);
        next unless($unwanted);
        # Candidates for the new parent are the siblings of the AuxY node,
        # except those that are themselves labeled AuxG, AuxX or AuxY.
        my @siblings = grep {$_->deprel() !~ m/^Aux[GXY]$/} ($node->get_siblings({'ordered' => 1}));
        my $n = scalar(@siblings);
        # More than one candidate is unexpected; report it but carry on.
        log_warn("Trying to reattach AuxY from AuxC to its argument, found $n parent candidates.") if($n > 1);
        # Without any candidate the node stays where it is.
        next if($n == 0);
        # Take the last candidate of the ordered sibling list.
        $node->set_parent($siblings[-1]);
    }
}
261+
262+
263+
212264
#------------------------------------------------------------------------------
213265
# Catches possible annotation inconsistencies. This method is called from
214266
# SUPER->process_zone() after convert_tags(), fix_morphology(), and
@@ -1003,6 +1055,21 @@ sub fix_annotation_errors
10031055
$subtree[1]->set_parent($subtree[2]);
10041056
$subtree[1]->set_is_extra_dependency(undef);
10051057
}
1058+
# PDT-C 2.0 dev tamw pdtsc_014_2.03 # 17
1059+
# A comma should not be a member of Coord unless it is itself the head of a Coord.
1060+
elsif($spanstring =~ m/^, čtyři z vrchu , dvě z boku ,$/)
1061+
{
1062+
my @subtree = $self->get_node_subtree($node);
1063+
$subtree[0]->set_is_member(undef);
1064+
}
1065+
# PDT-C 2.0 dev tamw pdtsc_097_2.04 # 1
1066+
elsif($spanstring =~ m/^už jsem nebyl asi tak rychlý nebo přece jenom tak obratný$/)
1067+
{
1068+
my @subtree = $self->get_node_subtree($node);
1069+
$subtree[8]->set_parent($subtree[10]);
1070+
$subtree[8]->set_is_member(undef);
1071+
$subtree[8]->set_is_extra_dependency(undef);
1072+
}
10061073
}
10071074
}
10081075

lib/Treex/Block/HamleDT/PragueDeprelsToUD.pm

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,10 +506,15 @@ sub convert_deprels
506506
# The AuxY deprel is used in various situations, see below.
507507
elsif($deprel eq 'AuxY')
508508
{
509+
# An AuxY depending on an AuxC may signal a multiword subordinator, which could be converted to mark and later to fixed.
510+
# However, some combinations are not fixed expressions. This could be decided using language-specific lists; another clue
511+
# is that if the words are not adjacent, they probably do not form a compound subordinator.
512+
###!!! See also HamleDT::CS::HarmonizePDTC::prevent_compound_subordinators();
509513
# When it is attached to a subordinating conjunction (AuxC), the two form a multi-word subordinator.
510514
# Index Thomisticus examples: ita quod (so that), etiam si (even if), quod quod (what is that), ac si (as if), et si (although)
511515
###!!! But not always! E.g., we had an apposition "jako X, čili jako Y", it was hamledtized as a hypotactic structure, then "čili" ended up attached as AuxY to "jako", but they do not form a fixed compound subordinator!
512-
if($parent->wild()->{prague_deprel} eq 'AuxC' && lc($node->form()) ne 'čili')
516+
###!!! Also not for double "že": "Myslí si, že když to máme za humny, že na to je dost času."
517+
if($parent->wild()->{prague_deprel} eq 'AuxC' && lc($node->form()) ne 'čili' && $node->form() !~ m/^(čili|že)$/i) ###!!! I should rather test the forms of both the parent and the child, so that I do not again discard something I do not want to discard!
513518
{
514519
# The phrase builder will later transform it to MWE.
515520
$deprel = 'mark';

0 commit comments

Comments
 (0)