// Name normalization function in PHP: splits a full name into
// forename and surname, taking into account various common
// surname prefixes and suffixes.
//
// By Jed Hartman (logos@kith.org), 11 February 2007.
// For more info, see:
// http://www.kith.org/journals/jed/2007/02/11/3813.html
//
// This code is in the public domain; no rights reserved. Use as you like.
//
// Parameter: Pass in a string containing a full name. Words are separated
// by spaces or no-break spaces; runs of separators count as one.
// Returns: A 2-element array. First element is fore-and-middle-names;
// second element is surname including prefixes and suffixes, if any.
// (Note that it's impossible to distinguish between a two-word
// forename and a first-name-plus-middle-name, so we don't even try.)
// Initials are normalized to have periods and spaces after them
// ("J.R.R." becomes "J. R. R.", "JR" becomes "J. R.").
// If the input contains only one name (possibly with surname prefixes
// or a suffix attached), that name is returned as the FIRST element and
// the second element is "". Empty input returns array("", "").
//
// Encoding: if the input is valid UTF-8 it is processed as UTF-8;
// otherwise it is processed byte by byte, with byte 0xA0 treated as a
// no-break space (as in Latin-1).
//
// Bugs:
//
// * Doesn't handle various non-Anglo approaches to naming, as in
// names like "Garcia y Lopez". To handle such cases, if you're
// manually reviewing names before you call this function, you can
// manually insert underscores between name elements that should
// stay together: "Jaime Garcia_y_Lopez".
//
// * Doesn't handle cases where a surname prefix is used as a
// middle name, as in names like "Joshua Ben David". Again,
// you can manually insert underscores: "Joshua_Ben David".
//
// * Doesn't account for multiple suffixes; v. rare.
//
// * Only ASCII capitals (A-Z) are recognized as initials.
//
// TODO:
//
// * Consider generating capitalization and punctuation variants
// for prefixes and suffixes rather than listing them all.
//
// * Remove other titles as well as "Dr."
function normalize_name($full_name)
{
    $last_name_prefixes = array ("da", "Da", "Dal", "de", "De", "del", "der", "Di", "e", "la", "La", "Le", "San", "St.", "Ste.", "van", "Van", "vel", "von");
    $last_name_suffixes = array ("Jr.", "jr.", "Jr", "jr", "Sr.", "2", "II", "III", "IV");

    $full_name = trim((string) $full_name);

    // Use UTF-8 aware patterns only when the input really is valid UTF-8.
    // Matching byte 0xA0 inside UTF-8 text would cut multi-byte characters
    // such as "Š" (C5 A0) in half; invalid UTF-8 falls back to byte matching.
    $u = (preg_match('//u', $full_name) === 1) ? 'u' : '';

    // Split on runs of space / no-break space, dropping empty tokens so that
    // doubled or stray separators never leak into the result.
    $words = preg_split('/[ \x{A0}]+/' . $u, $full_name, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false || count($words) === 0)
    {
        return array ("", "");
    }

    // Build the surname from the right-hand end of the name.
    $surname_words = array (array_pop($words));

    // A trailing suffix ("Jr.", "III", ...) drags the word before it into the surname.
    // Strict comparison: "02" or "2.0" must not be mistaken for the suffix "2".
    if (count($words) > 0 && in_array($surname_words[0], $last_name_suffixes, true))
    {
        array_unshift($surname_words, array_pop($words));
    }

    // Any run of prefixes ("van", "de", "la", ...) directly before the surname belongs to it.
    while (count($words) > 0 && in_array(end($words), $last_name_prefixes, true))
    {
        array_unshift($surname_words, array_pop($words));
    }

    // Change underscores to spaces, for multiword last names.
    $last_name = str_replace("_", " ", implode(" ", $surname_words));

    if (count($words) === 0)
    {
        return array ($last_name, ""); // If only one name, consider it to be a "first" (personal) name.
    }

    // Change underscores to spaces, for multiword first names.
    $first_name = str_replace("_", " ", implode(" ", $words));

    // Remove the title "Dr"/"Dr." from the start of the name, but only when it
    // is a whole word, so that forenames such as "Drew" are left intact.
    $first_name = preg_replace('/^Dr(?:\. ?| |$)/', "", $first_name);

    // Change all initials to have periods and spaces after them.
    // An initial is a single capital that is not glued to another name
    // character. Letters (including accented ones), digits, underscore and
    // the apostrophe all count as name characters, so "D'Arcy" and "Iñigo"
    // are not mistaken for an initial followed by something else.
    $name_char = "[\\p{L}\\p{N}_']";
    $not_after_name_char = '(?<!' . $name_char . ')';
    $not_before_name_char = '(?!' . $name_char . ')';

    // Step 1: an initial directly followed by another initial gets ". " after
    // it. The following initial is only looked at, not consumed, so every
    // member of a run such as "J.R.R." is handled. A bare pair like "JR"
    // qualifies, but longer all-caps words such as "AMY" do not.
    $first_name = preg_replace(
        '/' . $not_after_name_char . '([A-Z])\.?(?=[A-Z](?:\.|' . $not_before_name_char . '))/' . $u,
        '$1. ',
        $first_name
    );

    // Step 2: every remaining stand-alone initial gets its period.
    $first_name = preg_replace(
        '/' . $not_after_name_char . '([A-Z])(?:\.|' . $not_before_name_char . ')/' . $u,
        '$1.',
        $first_name
    );

    return array ($first_name, $last_name);
}