// Name normalization function in PHP: splits a full name into
// forename and surname, taking into account various common
// surname prefixes and suffixes.
//
// By Jed Hartman (logos@kith.org), 11 February 2007.
// For more info, see:
// http://www.kith.org/journals/jed/2007/02/11/3813.html
//
// This code is in the public domain; no rights reserved. Use as you like.
//
// Parameter: Pass in a string containing a full name.
// Returns: A 2-element array. First element is fore-and-middle-names;
// second element is surname including prefixes and suffixes, if any.
// (Note that it's impossible to distinguish between a two-word
// forename and a first-name-plus-middle-name, so we don't even try.)
// Initials are normalized to have periods and spaces after them.
//
// 2008-01-19: Cleaned up handling of periods, "Mr", and "Mrs". Note
// that "Jr" and "jr" now automatically get periods added. If I want
// to allow no-period versions of those, I could go through the
// passed-in name and keep track of what pieces have periods, but I
// don't think that's worth doing.
//
// 2009-10-20: Added "du". Considered adding a special case where if
// a name has only two parts, then prefixes are treated as first
// names, but decided against it; for example, I think "Von Foo" is
// meant to be a unitary name, not first + last. So not gonna do this.
//
// 2010-10-14: Added "Vander".
//
// Bugs:
//
// * Doesn't handle various non-Anglo approaches to naming, as in
// names like "Garcia y Lopez". To handle such cases, if you're
// manually reviewing names before you call this function, you can
// manually insert underscores between name elements that should
// stay together: "Jaime Garcia_y_Lopez".
//
// * Doesn't handle cases where a surname prefix is used as a
// middle name, as in names like "Joshua Ben David". Again,
// you can manually insert underscores: "Joshua_Ben David".
//
// TODO:
//
// * Consider replacing my whole system with this code:
// http://alphahelical.com/code/misc/nameparse/?misc/nameparse
// Or with this code:
// http://jasonpriem.com/human-name-parse/
//
// * Consider generating capitalization and punctuation variants
// for prefixes and suffixes rather than listing them all.
//
// * Clean up repetitive logic in middle of routine.
//
// * Remove other titles as well as "Dr."
//
// * Find a more elegant way to handle apostrophes in forenames.
//
// * Don't treat non-ASCII characters and parentheses as word breaks.
/**
 * Split a full name into forename(s) and surname.
 *
 * Processing steps:
 *   1. Strip markup tags, turn periods into spaces, trim.
 *   2. Split into words on runs of spaces / no-break spaces (empty words
 *      are never produced).
 *   3. Peel the surname off the end: the last word, preceded by one known
 *      suffix host ("Smith Jr") and any number of known surname prefixes
 *      ("van", "de la", ...).
 *   4. Whatever remains is the forename: leading "Dr"/"Mrs"/"Mr" is
 *      removed and capital initials get a period.
 *   5. Underscores (manual "keep together" markers) become spaces and the
 *      abbreviations in $add_periods get their period back.
 *
 * If nothing remains for the forename (single word, or everything was
 * absorbed into the surname, e.g. "Von Foo"), the whole name is returned
 * as the FIRST element and the second element is "".
 *
 * @param string $full_name Full name as typed; may contain tags, periods
 *                          and underscores.
 * @return array 2-element array: array(forename(s), surname).
 */
function normalize_name($full_name)
{
    $last_name_prefixes = array ("ben", "da", "Da", "Dal", "de", "De", "del", "Del", "den", "der", "Di", "du", "e", "la", "La", "Le", "Mc", "San", "St", "Ste", "van", "Van", "Vander", "vel", "von", "Von");
    $last_name_suffixes = array ("Jr", "jr", "Sr", "sr", "2", "II", "III", "IV");
    // "Ste" must stay ahead of "St" so the longer abbreviation is tried first.
    $add_periods = array ("Ste", "St", "Jr", "jr", "Sr", "sr");
    // Order matters: "Mrs" must be tried before "Mr".
    $titles = array ("Dr", "Mrs", "Mr");

    $full_name = (string) $full_name;
    $full_name = preg_replace("/<[^>]+>/", "", $full_name); // Remove x-flowed and other tags.
    $full_name = preg_replace("/\.+$/", "", $full_name);    // Remove final periods.
    $full_name = str_replace(".", " ", $full_name);         // Replace periods with spaces.
    // Trim only after the clean-up above: removing a trailing tag or period
    // can expose whitespace that would otherwise become an empty last word.
    $full_name = trim($full_name);

    // When the input is valid UTF-8, run the character-sensitive patterns in
    // UTF-8 mode so multi-byte letters are neither split in half (the byte
    // 0xA0 occurs inside e.g. "à" = C3 A0) nor treated as word breaks.
    // Anything else (e.g. Latin-1) keeps the original byte-wise behaviour.
    $u = (preg_match('//u', $full_name) === 1) ? 'u' : '';

    // Split on runs of space or no-break space; never emit empty words.
    $split_pattern = ($u === 'u') ? '/[ \x{A0}]+/u' : '/[ \xA0]+/';
    $all_names = preg_split($split_pattern, $full_name, -1, PREG_SPLIT_NO_EMPTY);
    if (empty($all_names))
    {
        return array ("", ""); // Nothing but whitespace/tags/periods.
    }

    $last_name = array_pop($all_names);
    $pending = array_pop($all_names); // Word just before the surname so far; null when exhausted.

    // Strict comparisons throughout: loose in_array() would treat numeric
    // look-alikes such as "02" as the suffix "2".
    // Doesn't account for multiple suffixes; fix eventually, but v. rare.
    if ($pending !== null && in_array($last_name, $last_name_suffixes, true))
    {
        $last_name = $pending . " " . $last_name;
        $pending = array_pop($all_names);
    }

    // Absorb any run of surname prefixes ("de la", "van der", ...).
    while ($pending !== null && in_array($pending, $last_name_prefixes, true))
    {
        $last_name = $pending . " " . $last_name;
        $pending = array_pop($all_names);
    }

    // Change underscores to spaces, for multiword last names.
    $last_name = str_replace("_", " ", $last_name);

    if ($pending === null)
    {
        // Only one name (possibly multi-word): consider it to be a "first"
        // (personal) name. It still gets the period fix-up below.
        $first_name = $last_name;
        $last_name = "";
    }
    else
    {
        array_push($all_names, $pending); // Put latest unused name back on stack.
        // Change underscores to spaces, for multiword first names.
        $first_name = str_replace("_", " ", join(" ", $all_names));

        // Remove leading titles.
        foreach ($titles as $title)
        {
            $first_name = preg_replace('/^' . $title . '\.? ?\b/' . $u, "", $first_name);
        }

        // Change all initials to have periods and spaces after them.
        // Apostrophes cause problems with the word-boundary test (the "O"
        // in "O'Brien" would look like an initial), so temporarily swap
        // them for a placeholder made of word characters.
        $first_name = str_replace("'", "QXZQXZQXZ", $first_name);
        $first_name = preg_replace('/\b([A-Z])([A-Z])(\.|\b)/' . $u, '$1 $2', $first_name); // Two cap letters: split them.
        $first_name = preg_replace('/\b([A-Z])(\.|\b)/' . $u, '$1.', $first_name);          // Single cap letter: add period.
        $first_name = str_replace("QXZQXZQXZ", "'", $first_name);
    }

    // Now add back in any missing periods. The pattern is anchored on BOTH
    // sides so that only whole words match; without the leading \b, a name
    // such as "Nasr" would become "Nasr.".
    foreach ($add_periods as $word)
    {
        $pattern = '/\b' . $word . '\b/' . $u;
        $first_name = preg_replace($pattern, $word . ".", $first_name);
        $last_name = preg_replace($pattern, $word . ".", $last_name);
    }

    return array ($first_name, $last_name);
}