Skip to content

Commit

Permalink
Added scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
blackwinter committed Sep 9, 2009
1 parent 2266f7d commit df99636
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 0 deletions.
77 changes: 77 additions & 0 deletions scripts/dbm2syn.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#! /usr/bin/ruby

#--
###############################################################################
# #
# dbm2syn -- Convert DBM to Lingo dictionary for perseus-a #
# #
# Copyright (C) 2009 Cologne University of Applied Sciences, #
# Claudiusstr. 1, #
# 50678 Cologne, Germany #
# #
# Authors: #
# Jens Wille <[email protected]> #
# #
# dbm2syn is free software; you can redistribute it and/or modify it under #
# the terms of the GNU General Public License as published by the Free #
# Software Foundation; either version 3 of the License, or (at your option) #
# any later version. #
# #
# dbm2syn is distributed in the hope that it will be useful, but WITHOUT ANY #
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
# details. #
# #
# You should have received a copy of the GNU General Public License along #
# with dbm2syn. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
#++

# Reads a line-oriented dump (ARGV[0]) and writes an inverted dictionary
# (ARGV[1]) that maps every field value to the IDs of the records it occurs in.
#
# Input lines recognised (see the regexps below):
#   ID:<id>        -- the ID of the current record
#   <key>:<value>  -- a field of the current record; when ARGV[2] is given,
#                     only keys matching that pattern are collected
#   &&&            -- end of the current record
#
# Output lines have the form "<value>*<id>|<id>|...", sorted by value.
# Note: a trailing record that is not terminated by "&&&" is not emitted.

abort "Usage: #{$0} <dbm> <syn> [<key>]" unless [2, 3].include?(ARGV.size)

# Flush progress dots immediately.
STDOUT.sync = true

# rec: fields of the record currently being read (key => list of values,
#      plus :id => record ID).
# syn: value => list of IDs of the records containing that value.
rec, syn = {}, Hash.new { |h, k| h[k] = [] }

ID_RE  = %r{\AID:(.*)}
KEY_RE = %r{\A(#{ARGV[2] || '.*?'}):(.*)}
REC_RE = %r{\A&&&\z}
SEP_RE = %r{\*}

File.foreach(ARGV[0]) { |line|
  print '.' if $. % 10_000 == 0

  case line.chomp
  when ID_RE
    rec[:id] = $1
  when KEY_RE
    (rec[$1] ||= []) << $2
  when REC_RE
    id = rec.delete(:id)

    if id
      # Each key holds a *list* of values, so flatten before indexing;
      # otherwise whole arrays would end up as dictionary keys.
      rec.values.flatten.each { |value|
        # '*' is the key separator of the output format, so a value
        # containing it yields an ambiguous line -- report it.
        warn "#{id}: #{value}" if value =~ SEP_RE

        syn[value] << id
      }
    end

    rec = {}
  end
}

puts

# Hash has no in-place sort; Hash#sort returns an array of
# [value, ids] pairs ordered by value.
entries = syn.sort

puts

File.open(ARGV[1], 'w') { |f|
  entries.each_with_index { |(value, ids), index|
    print '.' if index % 1_000 == 0

    f.puts "#{value}*#{ids.sort.uniq.join('|')}"
  }
}

puts
88 changes: 88 additions & 0 deletions scripts/merge_syn.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#! /usr/bin/ruby

#--
###############################################################################
# #
# merge_syn -- Intersection of Lingo results for perseus-a #
# #
# Copyright (C) 2009 Cologne University of Applied Sciences, #
# Claudiusstr. 1, #
# 50678 Cologne, Germany #
# #
# Authors: #
# Jens Wille <[email protected]> #
# #
# merge_syn is free software; you can redistribute it and/or modify it under #
# the terms of the GNU General Public License as published by the Free #
# Software Foundation; either version 3 of the License, or (at your option) #
# any later version. #
# #
# merge_syn is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with merge_syn. If not, see <http://www.gnu.org/licenses/>. #
# #
###############################################################################
#++

# Intersects several dictionary files of the form "<key>*<value>|<value>|...".
#
# The first input file sets the basis; every further input file narrows it
# down: a key survives only if it occurs in *all* input files, and only the
# values common to all of them are kept. Keys left without any value are
# dropped. The result is written to the last file named on the command
# line, sorted by key, with the values of each key sorted as well.

abort "Usage: #{$0} <in.syn>... <out.syn>" unless ARGV.size >= 2

require 'set'

# Flush progress dots immediately.
STDOUT.sync = true

KEY_SEPARATOR   = '*'.freeze
VALUE_SEPARATOR = '|'.freeze

# Splits one dictionary line into [key, values]. A line without a key
# separator yields an empty value list, and a blank line yields a nil key,
# instead of raising NoMethodError on nil.
parse_line = lambda { |line|
  key, values = line.chomp.split(KEY_SEPARATOR, 2)
  [key, values.to_s.split(VALUE_SEPARATOR)]
}

merge   = Hash.new { |h, k| h[k] = [] }
outfile = ARGV.pop

# first run sets the basis!
File.foreach(ARGV.shift) { |line|
  print '.' if $. % 1_000 == 0

  key, values = parse_line.call(line)
  merge[key] = values if key
}

# Keys seen in every file read so far. Kept as a Set so that the final
# membership test is a hash lookup rather than a linear Array scan.
keys = Set.new(merge.keys)

puts

ARGV.each { |syn|
  seen = Set.new

  File.foreach(syn) { |line|
    print '.' if $. % 1_000 == 0

    key, values = parse_line.call(line)
    next unless key

    # For keys absent from the basis the default block supplies [],
    # so the intersection stays empty and the key is dropped below.
    merge[key] &= values

    seen << key
  }

  keys &= seen

  puts
}

merge.delete_if { |key, values|
  values.empty? || !keys.include?(key)
}

puts

File.open(outfile, 'w') { |f|
  merge.sort.each_with_index { |(key, values), i|
    print '.' if i % 1_000 == 0

    f.puts "#{key}#{KEY_SEPARATOR}#{values.sort.join(VALUE_SEPARATOR)}"
  }
}

puts

0 comments on commit df99636

Please sign in to comment.