class Taxonifi::Splitter::Tokens::Authors

Complex breakdown of author strings. Handles a wide variety of formats.

See test_splitter_tokens.rb for scope. As with AuthorYear, this will match just about anything when used alone. Add exceptions as needed, but verify them with TestSplittTokens#test_authors. TODO: make the [a-z] character classes Unicode-aware?

Attributes

names[R]

Public Class Methods

new(input)
# File lib/splitter/tokens.rb, line 76
# Breaks an author string into individual authors and stores the result in
# @names, an Array of Hashes. Each Hash may carry:
#   :last_name - String, what is left after initials and suffixes are removed
#   :initials  - Array of Strings, one entry per initial group (period/space separated)
#   :suffix    - String, matched suffix words joined with a single space
#
# input - String of one or more authors, e.g. "Smith, J. and Jones, K.",
#         "Foo, Bar and Smith" or "Foo A, Bar ZA, Smith-Blorf A".
#         The string is not modified; a stripped copy is parsed instead.
#
# Raises RuntimeError if the author-by-author loop exceeds 100 passes.
def initialize(input)
  # Parse a stripped copy: the parsing below uses destructive slice!/strip!
  # calls, which must not leak into (or fail on a frozen) caller string.
  str = input.strip
  @names = []
  naked_and = false # look for the pattern 'Foo, Bar and Smith', i.e. no initials
  individuals = []
  last_individual = nil

  # We can simplify if there is an "and" or "&": everything after the first
  # separator is kept aside as the final individual.
  if str =~ /(\s+and\s+|\&)/
    str, last_individual = str.split(/\s+\,?\s*and\s+|\s+\&\s+/, 2)
    naked_and = true
  end

  # Look for an exception case, no initials, "and" or "&" previously present, like:
  #   Foo, Bar and Smith
  if naked_and && !str.include?('.') && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
    individuals.unshift(str.split(/\s*\,\s*/)) # nested Array, flattened further down
    str = nil
  end

  # Look for an exception case, no periods and multiple commas, like:
  #   Foo A, Bar ZA, Smith-Blorf A
  if str && !naked_and && str.split(',').size > 2 && !str.include?('.')
    individuals = str.split(',')
    str = nil
  end

  # Optional name prefixes/postfixes, each turned into an optional escaped group.
  prefix = ['van den ', 'Van ', "O'", 'Mc', 'Campos ', 'Costa ']
  pre_reg = prefix.map { |p| "(#{Regexp.escape(p)})?" }.join

  postfix = ['de la', 'von', 'da', 'van', ', Jr.']
  post_reg = postfix.map { |p| "(#{Regexp.escape(p)})?" }.join

  # Initials second, e.g. "Smith, J. R."
  # Extended (x) mode is required: the pattern carries inline comments and layout whitespace.
  initials_second = /^\s*(#{pre_reg}             # legal prefix words, includes space if present
                          [A-Z][a-z]+            # a capitalized name
                          (\-[A-Z][a-z]+)?       # optional dashed addition
                          \s*,\s*                # required comma
                          (\s*                   # initials, optionally surrounded by whitespace
                           (\-)?                 # optional preceding dash, hits second initials
                           [A-Z]                 # required capital initial
                           (\-)?                 # optional initial dash
                           (\-[A-Z])?            # optional dashed initial
                          \s*\.                  # required period
                          \s*)
                          {1,}                   # repeat initials as necessary
                          #{post_reg})           # optional legal postfixes
                      \s*/x

  # Initials first, must be followed by a comma: (R. Watson | R.F. Watson),
  initials_first = /^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/

  # Pick off remaining authors one at a time from the front of str.
  if str
    passes = 0
    loop do
      matched = false
      [initials_first, initials_second].each do |pattern|
        next unless str =~ pattern

        individual = Regexp.last_match(1)
        str.slice!(individual)
        str.strip!
        str.slice!(',') # drop the separator that followed this individual, if any
        individuals.push(individual)
        matched = true # at least one match, keep going
      end

      # Nothing recognisable left at the front: keep the remainder verbatim and stop.
      leftover = !matched && !str.empty?
      individuals.push(str) if leftover

      passes += 1
      raise "could not split authors after 100 passes: #{input.inspect}" if passes > 100
      break if leftover || str.empty?
    end
  end

  # Note to remember positive look behind (?<= ) for future hax
  # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

  individuals.push(last_individual) unless last_individual.nil?
  individuals.flatten!

  # At this point we have isolated individuals. Strategy is to slice out initials and the remainder is the last name.
  # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
  # TODO: Make a Token
  match_initials = /(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/

  # TODO: merge with pre/postfix list
  # NOTE(review): only the "jr." pattern is case-insensitive here, so that it
  # can match the ", Jr." postfix accepted by the initials-second pattern.
  # Confirm whether the remaining patterns should be case-insensitive too.
  suffixes = [
    /\s(van)\s?/,
    /\s(jr\.)/i,
    /\s(von)\s?/,
    /\s(de la)\s?/,
    /\s(da)\s?/
  ]

  individuals.each do |individual|
    author = {} # new author

    initials = nil
    if individual =~ match_initials
      initials = Regexp.last_match(1)
      individual.slice!(initials)
      individual.strip!
    end
    last_name = individual

    # Pull every recognised suffix out of the last name, in list order.
    suffix = []
    suffixes.each do |pattern|
      next unless last_name =~ pattern

      found = Regexp.last_match(1)
      suffix.push(found)
      last_name.slice!(found)
    end
    author[:suffix] = suffix.join(' ') unless suffix.empty?

    # Remove leftover punctuation (periods and commas) from the name.
    last_name.gsub!(/\.|\,/, '')

    author[:last_name] = last_name.strip if last_name # "if" not fully tested for consequences
    if initials && !initials.empty?
      author[:initials] = initials.strip.split(/\s|\./).map(&:strip).reject(&:empty?)
    end

    @names << author
  end
end