#! /usr/bin/ruby
# -*- coding: utf-8 -*-
# Directory that holds the input data files: the first command-line argument
# when one is given, otherwise the current directory.
BASEDIR = ARGV.empty? ? "." : ARGV[0]
# One entry per input file. :part is the part-of-speech label and :path is the
# location of the matching "data.<part>" file under BASEDIR.
DATAFILES = %w[noun verb adj adv].map do |part|
  { :part => part, :path => "#{BASEDIR}/data.#{part}" }
end
# Name of the tab-separated output file (relative to the working directory).
OUTFILE = 'wordnet.tsv'
# Stop before doing any work when BASEDIR is not an existing directory.
# The diagnostic is written to standard error (not standard output) so it is
# not mixed into regular output; the process exits with status 1.
unless File.directory?(BASEDIR)
  $stderr.puts("#{BASEDIR} is not a directory")
  exit(1)
end
# Convert every file listed in DATAFILES into tab-separated rows in OUTFILE.
#
# Each row has five columns:
#   key  - the surface form, lowercased
#   seq  - running record number, counted across all input files
#   face - the surface form with "_" replaced by a space
#   part - the :part label of the file the record came from
#   text - the gloss, i.e. whatever follows the first "|" on the line
#
# A progress line is printed to standard output every 1000 records.
seq = 0
File.open(OUTFILE, "w") do |outfile|
  DATAFILES.each do |info|
    part = info[:part]
    File.open(info[:path]) do |infile|
      infile.each do |line|
        line.force_encoding('UTF-8')
        # Lines that begin with a space are not records (presumably the
        # license header of the WordNet data files -- confirm against the data).
        next if line.start_with?(" ")
        line = line.strip

        # Split exactly once, at the FIRST "|". The previous greedy pattern
        # cut the gloss at the LAST "|", truncating any gloss that itself
        # contained a "|", and turned a line without "|" into its own gloss.
        # A line without "|" now yields an empty gloss.
        head, _bar, gloss = line.partition("|")
        text = gloss.sub(/\A */, "")

        # Drop everything from the first "@" onwards before tokenizing.
        fields = head.sub(/ *\@.*/, "").split(" ")
        next if fields.length < 4

        # The fourth field is read as a hexadecimal count of pairs; each pair
        # occupies two fields, starting at the fifth field.
        pivot = fields[3].hex * 2
        next if pivot + 4 > fields.length

        # Keep only the first item of each pair (the surface form).
        faces = fields[4, pivot].each_slice(2).map(&:first)

        faces.each do |face|
          face = face.tr("_", " ").gsub(/\s+/, " ")
          key = face.downcase
          seq += 1
          outfile.printf("%s\t%d\t%s\t%s\t%s\n", key, seq, face, part, text)
          printf("%s: %d records done\n", $0, seq) if seq % 1000 == 0
        end
      end
    end
  end
end