// Name normalization function in PHP: splits a full name into
// forename and surname, taking into account various common
// surname prefixes and suffixes.
//
// By Jed Hartman (logos@kith.org), 11 February 2007.
// For more info, see:
// http://www.kith.org/journals/jed/2007/02/11/3813.html
//
// This code is in the public domain; no rights reserved. Use as you like.
//
// Parameter: Pass in a string containing a full name.
// Returns: A 2-element array. First element is fore-and-middle-names;
// second element is surname including prefixes and suffixes, if any.
// (Note that it's impossible to distinguish between a two-word
// forename and a first-name-plus-middle-name, so we don't even try.)
// Initials are normalized to have periods and spaces after them.
//
// 2008-01-19: Cleaned up handling of periods, "Mr", and "Mrs". Note
// that "Jr" and "jr" now automatically get periods added. If I want
// to allow no-period versions of those, I could go through the
// passed-in name and keep track of what pieces have periods, but I
// don't think that's worth doing.
//
// 2009-10-20: Added "du". Considered adding a special case where if
// a name has only two parts, then prefixes are treated as first
// names, but decided against it; for example, I think "Von Foo" is
// meant to be a unitary name, not first + last. So not gonna do this.
//
// 2010-10-14: Added "Vander".
//
// Bugs:
//
// * Doesn't handle various non-Anglo approaches to naming, as in
//   names like "Garcia y Lopez". To handle such cases, if you're
//   manually reviewing names before you call this function, you can
//   manually insert underscores between name elements that should
//   stay together: "Jaime Garcia_y_Lopez".
//
// * Doesn't handle cases where a surname prefix is used as a
//   middle name, as in names like "Joshua Ben David". Again,
//   you can manually insert underscores: "Joshua_Ben David".
//
// TODO:
//
// * Consider replacing my whole system with this code:
//   http://alphahelical.com/code/misc/nameparse/?misc/nameparse
//   Or with this code:
//   http://jasonpriem.com/human-name-parse/
//
// * Consider generating capitalization and punctuation variants
//   for prefixes and suffixes rather than listing them all.
//
// * Clean up repetitive logic in middle of routine.
//
// * Remove other titles as well as "Dr.", "Mrs." and "Mr."
//
// * Don't treat parentheses as word breaks.

/**
 * Split a full personal name into fore-and-middle names and a surname.
 *
 * Processing steps:
 *  - markup tags (e.g. <x-flowed>) are stripped;
 *  - periods and no-break spaces are treated as plain spaces;
 *  - a trailing suffix ("Jr", "III", ...) and any run of surname prefixes
 *    ("van", "de", "St", ...) are attached to the surname;
 *  - a leading "Dr", "Mrs" or "Mr" is removed from the forenames;
 *  - capital-letter initials get a period and are separated by spaces
 *    ("JK" becomes "J. K.");
 *  - "Ste", "St", "Jr", "jr", "Sr" and "sr" get a period appended;
 *  - underscores are turned into spaces, so callers can glue words
 *    together ("Jaime Garcia_y_Lopez").
 *
 * Word boundaries are computed on bytes, with every byte >= 0x80 and the
 * apostrophe counted as part of a word. This keeps accented names
 * ("Léa", "Stéphane") and names such as "O'Neil" intact in UTF-8 as well
 * as in single-byte encodings.
 *
 * @param string $full_name Full name, e.g. "Dr. John Q. Public Jr.".
 *
 * @return array Two-element array: array($forenames, $surname). When the
 *               input boils down to a single name (including cases such as
 *               "Von Foo" or "Smith Jr"), that name is returned untouched
 *               as the first element and the second element is "".
 */
function normalize_name($full_name)
{
    $last_name_prefixes = array(
        "ben", "da", "Da", "Dal", "de", "De", "del", "Del", "den", "der",
        "Di", "du", "e", "la", "La", "Le", "Mc", "San", "St", "Ste",
        "van", "Van", "Vander", "vel", "von", "Von",
    );
    $last_name_suffixes = array("Jr", "jr", "Sr", "sr", "2", "II", "III", "IV");
    $add_periods = array("Ste", "St", "Jr", "jr", "Sr", "sr");
    $titles = array("Dr", "Mrs", "Mr"); // Removed in this order from the start of the forenames.

    // Stand-ins for \b. In byte mode \b sees every non-ASCII byte as a
    // non-word character, which would split "Léa" into "L" + "éa".
    // Counting bytes >= 0x80 and the apostrophe as word characters avoids
    // that and removes the need for a placeholder hack around apostrophes.
    $word_chars = "A-Za-z0-9_'\\x80-\\xFF";
    $word_start = '(?<![' . $word_chars . '])';
    $word_end = '(?![' . $word_chars . '])';

    $full_name = trim((string) $full_name);
    $full_name = preg_replace('/<[^>]+>/', '', $full_name); // Remove x-flowed and other tags.

    // Turn no-break spaces into plain spaces. In valid UTF-8 the NBSP is
    // the byte pair C2 A0; a lone A0 byte is only an NBSP in single-byte
    // encodings, and replacing it blindly would corrupt UTF-8 characters
    // such as "à" (C3 A0).
    if (preg_match('//u', $full_name) === 1) {
        $full_name = str_replace("\xC2\xA0", ' ', $full_name);
    } else {
        $full_name = str_replace("\xA0", ' ', $full_name);
    }

    // Periods act as separators only; they are re-added later where needed.
    $full_name = str_replace('.', ' ', $full_name);

    // Split on runs of spaces, dropping empty pieces left by stray spaces.
    $all_names = preg_split('/ +/', $full_name, -1, PREG_SPLIT_NO_EMPTY);

    $last_name = array_pop($all_names);
    if ($last_name === null) {
        return array("", ""); // Nothing left after clean-up.
    }

    $second_to_last_word = array_pop($all_names);
    if ($second_to_last_word === null) {
        return array($last_name, ""); // If only one name, consider it to be a "first" (personal) name.
    }

    // Doesn't account for multiple suffixes; fix eventually, but v. rare.
    if (in_array($last_name, $last_name_suffixes, true)) {
        $last_name = $second_to_last_word . " " . $last_name;
        $second_to_last_word = array_pop($all_names);
    }
    if ($second_to_last_word === null) {
        return array($last_name, ""); // Only "Name Suffix" was given: treat as a single name.
    }

    // Pull every consecutive surname prefix onto the front of the surname.
    while (in_array($second_to_last_word, $last_name_prefixes, true)) {
        $last_name = $second_to_last_word . " " . $last_name;
        $second_to_last_word = array_pop($all_names);
    }
    if ($second_to_last_word === null) {
        return array($last_name, ""); // Everything was prefix + surname: treat as a single name.
    }

    $last_name = str_replace('_', ' ', $last_name); // Underscores mark multiword last names.

    $all_names[] = $second_to_last_word; // Put latest unused name back on stack.
    $first_name = implode(" ", $all_names);
    $first_name = str_replace('_', ' ', $first_name); // Underscores mark multiword first names.

    // Remove leading titles ("Drew" is left alone because the title must
    // end at a word boundary).
    foreach ($titles as $title) {
        $pattern = '/^' . preg_quote($title, '/') . '\.?' . $word_end . ' ?/';
        $first_name = preg_replace($pattern, '', $first_name);
    }

    // Change all initials to have periods and spaces after them.
    // Two capital letters standing alone are two initials: "JK" -> "J K".
    $first_name = preg_replace(
        '/' . $word_start . '([A-Z])([A-Z])(?:\.|' . $word_end . ')/',
        '$1 $2',
        $first_name
    );
    // A capital letter standing alone is an initial: "J" -> "J.".
    $first_name = preg_replace(
        '/' . $word_start . '([A-Z])(?:\.|' . $word_end . ')/',
        '$1.',
        $first_name
    );

    // Now add back in any missing periods. The word must stand alone, so
    // surnames like "Nasr" or forenames like "Stéphane" are not touched.
    foreach ($add_periods as $word) {
        $pattern = '/' . $word_start . preg_quote($word, '/') . $word_end . '/';
        $first_name = preg_replace($pattern, $word . '.', $first_name);
        $last_name = preg_replace($pattern, $word . '.', $last_name);
    }

    return array($first_name, $last_name);
}