name-parser-php.html

// Name normalization function in PHP: splits a full name into
//   forename and surname, taking into account various common
//   surname prefixes and suffixes.
//
// By Jed Hartman (logos@kith.org), 11 February 2007.
// For more info, see:
// http://www.kith.org/journals/jed/2007/02/11/3813.html
//
// This code is in the public domain; no rights reserved.  Use as you like.
//
// Parameter: Pass in a string containing a full name.
// Returns: A 2-element array.  First element is fore-and-middle-names;
//   second element is surname including prefixes and suffixes, if any.
//   (Note that it's impossible to distinguish between a two-word
//   forename and a first-name-plus-middle-name, so we don't even try.)
//   Initials are normalized to have periods and spaces after them.
//
// 2008-01-19: Cleaned up handling of periods, "Mr", and "Mrs".  Note
//   that "Jr" and "jr" now automatically get periods added.  If I want
//   to allow no-period versions of those, I could go through the
//   passed-in name and keep track of what pieces have periods, but I
//   don't think that's worth doing.
//
// 2009-10-20: Added "du".  Considered adding a special case where if
//   a name has only two parts, then prefixes are treated as first
//   names, but decided against it; for example, I think "Von Foo" is
//   meant to be a unitary name, not first + last. So not gonna do this.
//
// 2010-10-14: Added "Vander".
//
// Bugs:
//
//   *  Doesn't handle various non-Anglo approaches to naming, as in
//      names like "Garcia y Lopez".  To handle such cases, if you're
//      manually reviewing names before you call this function, you can
//      manually insert underscores between name elements that should
//      stay together: "Jaime Garcia_y_Lopez".
//
//   *  Doesn't handle cases where a surname prefix is used as a
//      middle name, as in names like "Joshua Ben David".  Again,
//      you can manually insert underscores: "Joshua_Ben David".
//
// TODO:
//
//   *  Consider replacing my whole system with this code:
//      http://alphahelical.com/code/misc/nameparse/?misc/nameparse
//      Or with this code:
//      http://jasonpriem.com/human-name-parse/
//
//   *  Consider generating capitalization and punctuation variants
//      for prefixes and suffixes rather than listing them all.
//
//   *  Clean up repetitive logic in middle of routine.
//
//   *  Remove other titles as well as "Dr."
//
//   *  Find a more elegant way to handle apostrophes in forenames.
//
//   *  Don't treat non-ASCII characters and parentheses as word breaks.

// Splits a full name into forename(s) and surname.
//
// Parameter: $full_name - a string containing a full name.  Underscores
//   may be used to glue words that must stay together ("Joshua_Ben David").
// Returns: a 2-element array (forenames, surname).
//   *  Tags (<...>) are stripped and periods are treated as word breaks.
//   *  Surname prefixes ("van", "de", ...) and one trailing suffix
//      ("Jr", "III", ...) are attached to the surname.
//   *  Leading "Dr", "Mrs" and "Mr" are removed from the forenames.
//   *  One- and two-letter capital initials in the forenames get a
//      period and a space after them ("JR" becomes "J. R.").
//   *  "St", "Ste", "Jr", "jr", "Sr" and "sr" get a period added.
//   *  If nothing is left over to serve as a forename (a single word, or
//      only prefixes/suffix plus one word), the whole name is returned
//      as the first element, exactly as assembled, with "" as the surname.
//   *  Empty or whitespace-only input yields ("", "").
function normalize_name($full_name)
{
  $last_name_prefixes = array ("ben", "da", "Da", "Dal", "de", "De", "del", "Del", "den", "der", "Di", "du", "e", "la", "La", "Le", "Mc", "San", "St", "Ste", "van", "Van", "Vander", "vel", "von", "Von");
  $last_name_suffixes = array ("Jr", "jr", "Sr", "sr", "2", "II", "III", "IV");
  $add_periods = array ("Ste", "St", "Jr", "jr", "Sr", "sr");
  $titles = array ("Dr", "Mrs", "Mr");

  $full_name = trim((string) $full_name);

  // Use PCRE's UTF-8 mode only when the input really is valid UTF-8.
  // In UTF-8 mode the no-break space is matched as a character instead of
  // as the raw byte 0xA0 (which is also the tail byte of letters such as
  // "à"), and \b / \w understand accented letters.  For other encodings
  // (e.g. Latin-1) we fall back to byte mode, where /u would make every
  // preg_* call fail.
  $u = (preg_match('//u', $full_name) === 1) ? 'u' : '';

  $full_name = preg_replace('/<[^>]+>/' . $u, '', $full_name); // Remove x-flowed and other tags.
  $full_name = str_replace('.', ' ', $full_name); // Periods (final ones included) act as word breaks.

  // Split on runs of spaces / no-break spaces; never produce empty words.
  $all_names = preg_split('/[ \x{A0}]+/' . $u, $full_name, -1, PREG_SPLIT_NO_EMPTY);

  $last_name = array_pop($all_names);
  if (is_null($last_name))
  {
    return array ("", "");  // Nothing but whitespace, tags, or periods.
  }

  $second_to_last_word = array_pop($all_names);

  // Doesn't account for multiple suffixes; fix eventually, but v. rare.
  if (!is_null($second_to_last_word) && in_array($last_name, $last_name_suffixes, true))
  {
    $last_name = $second_to_last_word . " " . $last_name;
    $second_to_last_word = array_pop($all_names);
  }

  // Pull any chain of surname prefixes ("van der ...") onto the surname.
  // array_pop() yields null once the words run out, which ends the loop.
  while (in_array($second_to_last_word, $last_name_prefixes, true))
  {
    $last_name = $second_to_last_word . " " . $last_name;
    $second_to_last_word = array_pop($all_names);
  }

  if (is_null($second_to_last_word))
  {
    return array ($last_name, "");  // If only one name, consider it to be a "first" (personal) name.
  }

  $last_name = str_replace("_", " ", $last_name); // Change underscores to spaces, for multiword last names
  $all_names[] = $second_to_last_word; // Put latest unused name back on stack
  $first_name = str_replace("_", " ", join(" ", $all_names)); // Same for multiword first names

  // Remove titles from the start of the name.  The trailing \b keeps
  // "Drew" or "Mrinal" from being mistaken for "Dr" / "Mr".
  foreach ($titles as $title)
  {
    $first_name = preg_replace('/^' . $title . '\.? ?\b/' . $u, '', $first_name);
  }

  // Change all initials to have periods and spaces after them.
  // The lookarounds act like \b, except that an apostrophe also counts as
  // part of a word, so the "D" in "D'Arcy" is not taken for an initial.
  $first_name = preg_replace('/(?<![\w\'])([A-Z])([A-Z])(?![\w\'])/' . $u, '$1 $2', $first_name); // Two cap letters standing alone
  $first_name = preg_replace('/(?<![\w\'])([A-Z])(?![\w\'])/' . $u, '$1.', $first_name);  // Single cap letter standing alone

  // Now add back in any missing periods.  Whole words only, so that "St"
  // inside a longer word is left alone.
  foreach ($add_periods as $word)
  {
    $pattern = '/\b' . $word . '\b/' . $u;
    $first_name = preg_replace($pattern, $word . '.', $first_name);
    $last_name = preg_replace($pattern, $word . '.', $last_name);
  }

  return array ($first_name, $last_name);
}