Complex breakdown of author strings. Handles a wide variety of formats.
See test_splitter_tokens.rb for scope. As with AuthorYear, this will match just about anything when used alone. Add exceptions at will; just test them using TestSplittTokens#test_authors. TODO: make the [a-z] character classes Unicode-aware?
# File lib/splitter/tokens.rb, line 76
#
# Breaks an author string into individual authors and stores one Hash per
# author in @names.
#
# Each Hash always has :last_name (String). It has :initials (Array of String)
# only when initials were recognised, and :suffix (String, space-joined) only
# when one of the known lowercase particles (van, jr., von, de la, da) was
# found in the remaining name.
#
# The caller's string is never modified; parsing works on a stripped copy.
#
# @param input [String] the raw author string, e.g. "Smith, J. and Jones, K."
# @raise [RuntimeError] if the author-by-author pass exceeds 100 iterations
def initialize(input)
  str = input.to_s.strip # strip returns a copy, so the caller's string is untouched
  @names = []
  naked_and = false # true once an "and" / "&" has been split off
  individuals = []
  last_individual = nil

  # We can simplify if there is an "and" or "&": everything to its right is
  # the final author.
  if str =~ /(\s+and\s+|\&)/
    left, right = str.split(/\s+\,?\s*and\s+|\s+\&\s+/, 2)
    last_individual = right
    str = left
    naked_and = true
  end

  # Exception case: no initials, "and" or "&" previously present, like:
  #   Foo, Bar and Smith
  if naked_and && str !~ /\./ && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
    individuals.concat(str.split(/\s*\,\s*/))
    str = nil
  end

  # Exception case: no periods and multiple commas, like:
  #   Foo A, Bar ZA, Smith-Blorf A
  if str && !naked_and && str.split(',').size > 2 && str !~ /\./
    individuals = str.split(',')
    str = nil
  end

  # Every prefix/postfix becomes its own optional group. Regexp.escape also
  # escapes the embedded spaces, which keeps them significant in /x mode.
  prefix = ['van den ', 'Van ', "O'", 'Mc', 'Campos ', 'Costa ']
  pre_reg = prefix.map { |p| "(#{Regexp.escape(p)})?" }.join

  postfix = ['de la', 'von', 'da', 'van', ', Jr.']
  post_reg = postfix.map { |p| "(#{Regexp.escape(p)})?" }.join

  # Initials second, e.g. "Watson, R.F." (extended mode: layout and comments
  # inside the pattern are ignored).
  initials_second = /^\s*(#{pre_reg}     # legal prefix words, includes space if present
                     [A-Z][a-z]+         # a capitalized name
                     (\-[A-Z][a-z]+)?    # optional dashed addition
                     \s*,\s*             # required comma
                     (\s*                # initials, optionally surrounded by whitespace
                      (\-)?              # optional preceding dash, hits second initials
                      [A-Z]              # required capital initial
                      (\-)?              # optional initial dash
                      (\-[A-Z])?         # optional dashed initial
                      \s*\.              # required period
                      \s*){1,}           # repeat initials as necessary
                     #{post_reg})        # optional legal postfixes
                     \s*/x

  # Initials first, followed by a comma: "R. Watson," or "R.F. Watson,"
  initials_first = /^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/

  # Pick off remaining authors one at a time from the front of the string.
  if str
    passes = 0
    until str.empty?
      matched = false
      [initials_first, initials_second].each do |regex|
        next unless str =~ regex

        individual = Regexp.last_match(1)
        str.slice!(individual)
        str.strip!
        str.slice!(',') # drop the separator that followed this author
        individuals.push(individual)
        matched = true # at least one match, keep going
      end

      # Nothing recognisable is left: keep the remainder as a single author.
      individuals.push(str) unless matched

      passes += 1
      raise 'author parsing did not terminate after 100 passes' if passes > 100
      break unless matched
    end
  end

  # Note to remember positive look behind (?<= ) for future hax
  # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

  individuals.push(last_individual) unless last_individual.nil?

  # At this point we have isolated individuals. Strategy is to slice out the
  # initials; the remainder is the last name.
  # The initials regex matches any "A-B.", "A.", " A " or "A-B" pattern
  # (including repeats). Group 1 wraps the whole alternation.
  # TODO: Make a Token
  match_initials = /(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/

  # TODO: merge with pre/postfix list
  suffixes = [
    /\s(van)\s?/,
    /\s(jr\.)/,
    /\s(von)\s?/,
    /\s(de la)\s?/,
    /\s(da)\s?/
  ]

  individuals.each do |person|
    author = {}
    initials = nil

    if person =~ match_initials
      initials = Regexp.last_match(1)
      person.slice!(initials)
      person.strip!
    end
    last_name = person

    # Pull any known particles out of what remains of the name.
    found_suffixes = []
    suffixes.each do |pattern|
      next unless last_name =~ pattern

      token = Regexp.last_match(1)
      found_suffixes.push(token)
      last_name.slice!(token)
    end
    author[:suffix] = found_suffixes.join(' ') unless found_suffixes.empty?

    last_name.delete!('.,') # stray punctuation left over from slicing
    author[:last_name] = last_name.strip

    if initials && !initials.empty?
      author[:initials] = initials.strip.split(/\s|\./).map(&:strip).reject(&:empty?)
    end

    @names << author
  end
end