This repository has been archived by the owner on Feb 14, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsemi-lexicalise.pl
125 lines (111 loc) · 2.13 KB
/
semi-lexicalise.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/perl
use warnings;
use strict;
use utf8;
use XML::Parser;
# ---------------------------------------------------------------------------
# Main script.
#
# Usage: semi-lexicalise.pl CONFIG.xml [INPUT]
#
# CONFIG.xml is parsed with XML::Parser; handle_start() turns every
# <lexicalized-word> element it accepts into a regex fragment stored in @tags.
# INPUT (or STDIN when no second argument is given) is then read one token per
# line.  Tokens matching one of the @tags patterns keep their lemma; all other
# known tokens are reduced to their tags only.
# ---------------------------------------------------------------------------

# State shared with handle_start().  It is declared and initialised *before*
# parsefile() runs, because parsefile() is what invokes the handler that sets
# $reading and pushes onto @tags.
my @tags;
my $reading = 0;

# Set this to 1 to normalise unknowns
my $normalise_unk = 0;

die "Usage: $0 CONFIG.xml [INPUT]\n" unless defined $ARGV[0];

my $parser = XML::Parser->new(Handlers => {Start => \&handle_start});
$parser->parsefile($ARGV[0]) or die "$!\n";

my $input;
if ($ARGV[1]) {
    # Three-argument open: the file name can never be interpreted as a mode.
    open($input, '<', $ARGV[1]) or die "$!\n";
} else {
    $input = *STDIN;
}
binmode $input, ":utf8";
binmode STDOUT, ":utf8";

# Compile the alternation of all collected patterns once instead of once per
# input line.  With no patterns at all an empty group would match every
# token, so use a pattern that can never match in that case.
my $lexicalised_re = @tags
    ? do { my $alternation = join("|", @tags); qr/(?:$alternation)/ }
    : qr/(?!)/;

while (my $line = <$input>) {
    chomp $line;

    # Trim leading & trailing whitespace
    $line =~ s/^\s*//;
    $line =~ s/\s*$//;

    # Blank lines are passed through unchanged
    if ($line eq '') {
        print "\n";
        next;
    }

    # This is supposed to be one token per line,
    # so dump the begin and end markers and check for them
    $line =~ s/^\^//;
    $line =~ s/\$$//;
    die "Input should be one token per line" if ($line =~ /[\^\$]/);

    # Throw away any escaped slashes (all of them, not just the first one)
    $line =~ s/\\\///g;

    # Unknown words: drop everything up to the first slash and print the
    # rest, or the fixed marker "UNK" when normalisation is switched on
    if ($line =~ /\*/) {
        $line =~ s/^[^\/]*\///;
        print $normalise_unk ? "UNK\n" : "$line\n";
        next;
    }

    if ($line =~ $lexicalised_re) {
        # Detach a trailing "#..." part before splitting the analysis
        my $tail = "";
        if ($line =~ s/(#.*)$//) {
            $tail = $1;
        }

        # Drop everything up to and including the first slash
        $line =~ s/^[^\/]*\///;

        if ($line =~ /\+/) {
            # Several units joined with '+': decide for each one separately
            my @each = split /\+/, $line;

            # Re-attach the tail directly after the lemma of the first unit
            if ($tail ne '') {
                $each[0] =~ s/^([^<]*)/$1$tail/;
            }

            for my $e (@each) {
                $e =~ s/ /_/g;
                # Units that match no pattern lose their lemma
                if ($e !~ $lexicalised_re) {
                    $e =~ s/^[^<]*</</;
                }
            }
            print join('+', @each) . "\n";
        } else {
            # NOTE(review): unlike the '+' branch above, the "#..." tail that
            # was removed is not re-attached here - confirm this is intended.
            if ($line =~ $lexicalised_re) {
                $line =~ s/ /_/g;
            } else {
                $line =~ s/^[^<]*</</;
            }
            print "$line\n";
        }
    } else {
        # Not lexicalised: strip the part before the first slash, the lemma
        # and any "#..." tail, leaving only the tags
        $line =~ s/^[^\/]*\///;
        $line =~ s/^[^<]*</</;
        $line =~ s/#.*$//;
        print "$line\n";
    }
}
# XML::Parser "Start" handler.
#
# Parameters:
#   $expat   - the parser object passed in by XML::Parser (unused here)
#   $element - name of the element that was just opened
#   %attrs   - the element's attributes as name/value pairs
#
# Side effects on file-scoped state:
#   * sets $reading to 1 once a <source> element has been seen;
#   * for every <lexicalized-word> seen while $reading is true, pushes one
#     regex fragment of the form  LEMMA <tag>... #TAIL  onto @tags.
#
# The "tags" attribute is a dot-separated list; a component that is exactly
# "*" becomes "one or more arbitrary <...> tags".  A "lemma" attribute
# containing '#' is split at its last '#': the text before it goes in front
# of the tags, the '#' and what follows go after them.
sub handle_start {
    my ($expat, $element, %attrs) = @_;

    if ($element eq 'source') {
        $reading = 1;
    }
    return unless $element eq 'lexicalized-word' && $reading;

    my $lemma     = "";
    my $lemmatail = "";
    my $raw_lemma = $attrs{'lemma'};

    # Test definedness/emptiness explicitly so that a lemma of "0" is kept.
    if (defined $raw_lemma && $raw_lemma ne '') {
        # Split at the LAST '#'.  A regex is used instead of split() because
        # split() discards trailing empty fields and therefore mis-handles a
        # lemma that ends in '#'.
        if ($raw_lemma =~ /^(.*)(#[^#]*)$/s) {
            ($lemma, $lemmatail) = ($1, $2);
        } else {
            $lemma = $raw_lemma;
        }
    }

    # A missing "tags" attribute is treated as an empty tag list.
    my $tagspec = defined $attrs{'tags'} ? $attrs{'tags'} : '';

    my $regex = "";
    for my $tag (split(/\./, $tagspec)) {
        if ($tag eq '*') {
            $regex .= '(?:<[^>]*>)+';
        } else {
            $regex .= '<' . $tag . '>';
        }
    }

    # The lemma parts are literal text, so escape regex metacharacters in
    # them before they are interpolated into a pattern by the caller.
    my $pattern = quotemeta($lemma) . $regex . quotemeta($lemmatail);

    # An empty pattern would match every token when used as an alternative,
    # so an element carrying neither lemma nor tags is ignored.
    return if $pattern eq '';

    push(@tags, $pattern);
    return;
}