Blob Blame History Raw
#! /usr/bin/ruby
# -*- coding: utf-8 -*-

# Directory that holds the input data files: the first command-line argument
# when one is given, otherwise the current directory.
BASEDIR = ARGV.empty? ? "." : ARGV[0]

# Input files to convert, in processing order. Each entry carries the part
# label written to the output (:part) and the path of the file to read (:path).
DATAFILES = %w[noun verb adj adv].map do |part|
  { :part => part, :path => "#{BASEDIR}/data.#{part}" }
end

# Name of the TSV file the script writes (relative to the working directory).
OUTFILE = 'wordnet.tsv'

# Refuse to run when the base directory does not exist.
unless File.directory?(BASEDIR)
  printf("%s is not a directory\n", BASEDIR)
  exit(1)
end

# Splits one record line of a data file into its word forms and its gloss.
#
# The record layout follows the WordNet database format: the fourth
# space-separated field is a hexadecimal word count, followed by that many
# "word lex_id" pairs; the gloss is the text after the first vertical bar.
#
# line - a record line (header lines must be filtered out by the caller).
#
# Returns [faces, text], where faces is the array of raw word forms (still
# containing underscores) and text is the gloss ("" when the record has no
# gloss), or nil when the record is too short to hold the announced words.
def parse_synset_line(line)
  # The gloss starts at the FIRST bar; a bar inside the gloss belongs to it.
  head, _bar, gloss = line.strip.partition("|")
  text = gloss.sub(/\A */, "")

  # Everything from the first "@" onward is dropped before splitting
  # (NOTE(review): "@" looks like a pointer symbol in this format -- confirm).
  fields = head.sub(/ *\@.*/, "").split(" ")
  return nil if fields.length < 4

  # Each word occupies two fields: the word itself and its lex_id.
  pair_fields = fields[3].hex * 2
  return nil if pair_fields + 4 > fields.length

  faces = fields[4, pair_fields].each_slice(2).map(&:first)
  [faces, text]
end

# Convert every data file into one TSV with the columns:
#   key (lower-cased word), sequence number, word, part, gloss
seq = 0
File.open(OUTFILE, "w") do |outfile|
  DATAFILES.each do |info|
    part = info[:part]
    File.open(info[:path]) do |infile|
      infile.each_line do |line|
        line.force_encoding('UTF-8')
        # Lines starting with a space are skipped (not records).
        next if line.start_with?(" ")

        faces, text = parse_synset_line(line)
        next unless faces

        faces.each do |raw_face|
          # Underscores stand for spaces; collapse any resulting runs.
          face = raw_face.tr("_", " ").squeeze(" ")
          seq += 1
          outfile.printf("%s\t%d\t%s\t%s\t%s\n", face.downcase, seq, face, part, text)
          # Progress report on standard output every 1000 records.
          printf("%s: %d records done\n", $0, seq) if seq % 1000 == 0
        end
      end
    end
  end
end