-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
Update originality mechanism
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
include *.md | ||
include prenoms/dist.noms.txt prenoms/dist.prenoms.txt | ||
include prenoms/data/*.txt | ||
include licenses/names.txt |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
# INSEE Prénoms input format: semicolon-separated, one record per line.
# Header line:    sexe;preusuel;annais;nombre
# Example record: 1;A;1980;3
|
||
import os.path | ||
import operator | ||
import argparse | ||
|
||
# Decade-bucket indices used as keys of the first-name distributions.
# Each constant is named after the first year of its bucket; the values
# must stay contiguous because the dump loop iterates
# range(C_1900, C_2011 + 1).
C_1900 = 0
C_1911 = 1
C_1921 = 2
C_1931 = 3
C_1941 = 4
C_1951 = 5
C_1961 = 6
C_1971 = 7
C_1981 = 8
C_1991 = 9
C_2001 = 10
C_2011 = 11
|
||
|
||
def get_constant(year: int) -> int:
    """Return the decade-bucket constant that ``year`` falls into.

    Years before 1911 map to ``C_1900``. Every later bucket covers the ten
    years starting at the year in its name (``C_1911`` is 1911-1920, and so
    on). 2011 and every later year map to ``C_2011``.
    """
    # (exclusive upper bound, bucket) pairs, in ascending order of the bound.
    upper_bounds = (
        (1911, C_1900),
        (1921, C_1911),
        (1931, C_1921),
        (1941, C_1931),
        (1951, C_1941),
        (1961, C_1951),
        (1971, C_1961),
        (1981, C_1971),
        (1991, C_1981),
        (2001, C_1991),
        (2011, C_2001),
    )
    for upper, bucket in upper_bounds:
        if year < upper:
            return bucket
    return C_2011
|
||
|
||
def year_to_string(i: int) -> str:
    """Return the four-digit label of decade bucket ``i``.

    The label is the first year of the bucket (``C_1900`` -> ``'1900'``).
    Any value that is not one of the earlier bucket constants yields
    ``'2011'``.
    """
    labels = {
        C_1900: '1900',
        C_1911: '1911',
        C_1921: '1921',
        C_1931: '1931',
        C_1941: '1941',
        C_1951: '1951',
        C_1961: '1961',
        C_1971: '1971',
        C_1981: '1981',
        C_1991: '1991',
        C_2001: '2001',
    }
    # '2011' is the catch-all for everything not listed above.
    return labels.get(i, '2011')
|
||
|
||
def gender_to_string(i: int) -> str:
    """Return ``'m'`` when ``i`` is 0 and ``'f'`` for any other value."""
    return 'm' if i == 0 else 'f'
|
||
|
||
def increment_dict(d: dict, key, increment) -> None:
    """Add ``increment`` to ``d[key]`` in place, creating the entry if needed.

    When ``key`` is absent, ``d[key]`` is set to ``increment`` itself.
    """
    # Membership on the dict itself; building a keys() view is unnecessary.
    if key in d:
        d[key] += increment
    else:
        d[key] = increment
|
||
|
||
def _cutoff_indices(ranked):
    """Return ``(common, uncommon, rare, total)`` cut-off indices.

    ``ranked`` is a list of ``(name, count)`` tuples sorted by descending
    count. ``common``, ``uncommon`` and ``rare`` are the index of the first
    entry whose count is below 500, 100 and 50 respectively; ``total`` is
    ``len(ranked)``.
    """
    # None (not 0) marks "not found yet": 0 is a legitimate index, and using
    # it as the sentinel let a cut-off found at index 0 be overwritten by 1.
    common = uncommon = rare = None
    for index, (_, count) in enumerate(ranked):
        if common is None and count < 500:
            common = index
        if uncommon is None and count < 100:
            uncommon = index
        if count < 50:
            rare = index
            break  # All three cut-offs are known once the rarest is hit.
    # NOTE(review): a threshold that is never crossed is reported as 0, as
    # before; confirm that is what the reader of these files expects.
    return (common or 0, uncommon or 0, rare or 0, len(ranked))


def _write_distribution(path, cutoffs, ranked):
    """Write the cut-off header line, then every name followed by a comma."""
    with open(path, mode='w', encoding='utf-8') as f:
        f.write('{} {} {} {}\n'.format(*cutoffs))
        f.write(''.join(name + ',' for name, _ in ranked))


def parse_insee(insee_file, output_folder):
    """Build ranked first-name lists from an INSEE file and write them out.

    ``insee_file`` is read as UTF-8 text with ``;``-separated fields
    (gender, name, year, count). The first line is skipped as a header.
    Blank lines, rows whose year is ``'XXXX'`` and rows named
    ``'_PRENOMS_RARES'`` are ignored.

    Counts are accumulated per gender (gender field minus one), per decade
    bucket and overall. Decade lists keep only names with a count above 10;
    the overall list keeps every name. Each list is sorted by descending
    count and written to ``output_folder`` as
    ``dist.prenoms.<m|f>.<bucket>.txt`` and ``dist.prenoms.<m|f>.all.txt``:
    one line holding the four cut-off indices, then the names, each
    followed by a comma.
    """
    buckets = range(C_1900, C_2011 + 1)
    # One {bucket: {name: count}} mapping per gender index (0 and 1).
    counts = []
    for _ in range(2):
        per_gender = {bucket: {} for bucket in buckets}
        per_gender['all'] = {}
        counts.append(per_gender)

    with open(insee_file, mode='r', encoding='utf-8') as f:
        next(f, None)  # Skip the header line.
        for line in f:
            line = line.strip()
            # str.split never returns an empty list, so blank lines have to
            # be filtered here or infos[2] below raises IndexError.
            if not line:
                continue
            infos = line.split(';')
            if infos[2] == 'XXXX' or infos[1] == "_PRENOMS_RARES":
                continue
            gender = int(infos[0]) - 1
            name = infos[1].title()
            year = int(infos[2])
            count = int(infos[3])
            increment_dict(counts[gender][get_constant(year)], name, count)
            increment_dict(counts[gender]['all'], name, count)

    distribs = []
    max_ids = []
    for per_gender in counts:
        ranked_by_key = {}
        cutoffs_by_key = {}
        for key, name_counts in per_gender.items():
            ranked = sorted(name_counts.items(),
                            key=operator.itemgetter(1), reverse=True)
            if key != 'all':
                # Decade lists drop names seen 10 times or fewer.
                ranked = [entry for entry in ranked if entry[1] > 10]
            ranked_by_key[key] = ranked
            cutoffs_by_key[key] = _cutoff_indices(ranked)
        distribs.append(ranked_by_key)
        max_ids.append(cutoffs_by_key)

    for gender in (0, 1):
        gender_label = gender_to_string(gender)
        for i in buckets:
            file_name = 'dist.prenoms.{}.{}.txt'.format(gender_label,
                                                        year_to_string(i))
            _write_distribution(os.path.join(output_folder, file_name),
                                max_ids[gender][i], distribs[gender][i])
        file_name = 'dist.prenoms.{}.all.txt'.format(gender_label)
        _write_distribution(os.path.join(output_folder, file_name),
                            max_ids[gender]['all'], distribs[gender]['all'])
|
||
|
||
def main():
    """Parse the command line and run :func:`parse_insee`.

    Both ``-i/--input`` and ``-o/--output`` are mandatory. Without them
    ``parse_insee`` would receive ``None`` and fail with a ``TypeError``
    instead of a usage message.
    """
    parser = argparse.ArgumentParser(description='Parse INSEE Prenoms file')
    parser.add_argument('-i', '--input', dest='insee_file', required=True,
                        help='Input Insee file')
    parser.add_argument('-o', '--output', dest='output_folder', required=True,
                        help='Output folder for files')
    args = parser.parse_args()
    parse_insee(args.insee_file, args.output_folder)


if __name__ == '__main__':
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# The INSEE Names input file is tab-separated; its header row is:
# NOM _1891_1900 _1901_1910 _1911_1920 _1921_1930 _1931_1940 _1941_1950 _1951_1960 _1961_1970
# _1971_1980 _1981_1990 _1991_2000
|
||
import os.path | ||
import argparse | ||
|
||
# Decade-bucket indices used as keys of the last-name distributions.
# Each constant is named after the first year of its bucket. The value is
# also the column offset in the input row: the count for bucket i is read
# from field i + 1.
C_1891 = 0
C_1901 = 1
C_1911 = 2
C_1921 = 3
C_1931 = 4
C_1941 = 5
C_1951 = 6
C_1961 = 7
C_1971 = 8
C_1981 = 9
C_1991 = 10
|
||
|
||
def year_to_string(i: int):
    """Return the four-digit label of decade bucket ``i``.

    The label is the first year of the bucket (``C_1891`` -> ``'1891'``).
    Returns ``None`` when ``i`` is not one of the bucket constants.
    """
    labels = {
        C_1891: '1891',
        C_1901: '1901',
        C_1911: '1911',
        C_1921: '1921',
        C_1931: '1931',
        C_1941: '1941',
        C_1951: '1951',
        C_1961: '1961',
        C_1971: '1971',
        C_1981: '1981',
        C_1991: '1991',
    }
    # dict.get makes the "unknown bucket -> None" result explicit instead of
    # relying on falling off the end of the function.
    return labels.get(i)
|
||
|
||
def _cutoff_indices(ranked):
    """Return ``(common, uncommon, rare, total)`` cut-off indices.

    ``ranked`` is a list of ``(name, count)`` tuples sorted by descending
    count. ``common``, ``uncommon`` and ``rare`` are the index of the first
    entry whose count is below 500, 100 and 50 respectively; ``total`` is
    ``len(ranked)``.
    """
    # None (not 0) marks "not found yet": 0 is a legitimate index, and using
    # it as the sentinel let a cut-off found at index 0 be overwritten by 1.
    common = uncommon = rare = None
    for index, (_, count) in enumerate(ranked):
        if common is None and count < 500:
            common = index
        if uncommon is None and count < 100:
            uncommon = index
        if count < 50:
            rare = index
            break  # All three cut-offs are known once the rarest is hit.
    # NOTE(review): a threshold that is never crossed is reported as 0, as
    # before; confirm that is what the reader of these files expects.
    return (common or 0, uncommon or 0, rare or 0, len(ranked))


def _write_distribution(path, cutoffs, ranked):
    """Write the cut-off header line, then every name followed by a comma."""
    with open(path, mode='w', encoding='utf-8') as f:
        f.write('{} {} {} {}\n'.format(*cutoffs))
        f.write(''.join(name + ',' for name, _ in ranked))


def parse_insee(insee_name_file: str, output_folder: str):
    """Build ranked last-name lists from an INSEE file and write them out.

    ``insee_name_file`` is read as UTF-8 text with tab-separated fields: the
    name, then one count per decade bucket. The first line is skipped as a
    header. Blank lines and the ``"AUTRES NOMS"`` row are ignored.

    A name enters a decade list only when its count for that decade is at
    least 10. The overall list holds every name with the sum of its decade
    counts. Each list is sorted by descending count and written to
    ``output_folder`` as ``dist.noms.<bucket>.txt`` and
    ``dist.noms.all.txt``: one line holding the four cut-off indices, then
    the names, each followed by a comma.
    """
    decades = range(C_1891, C_1991 + 1)
    distribs = {decade: [] for decade in decades}
    distribs['all'] = []

    with open(insee_name_file, mode='r', encoding='utf-8') as f:
        next(f, None)  # Skip the header row.
        for line in f:
            line = line.strip()
            # str.split never returns an empty list, so blank lines have to
            # be filtered here or the count lookup raises IndexError.
            if not line:
                continue
            infos = line.split('\t')
            if infos[0] == "AUTRES NOMS":
                continue
            name = infos[0].title()
            total_count = 0
            for i in decades:
                count = int(infos[i + 1])  # Field 0 is the name itself.
                total_count += count
                if count >= 10:
                    distribs[i].append((name, count))
            distribs['all'].append((name, total_count))

    max_ids = {}
    for key, ranked in distribs.items():
        ranked.sort(key=lambda tup: tup[1], reverse=True)
        max_ids[key] = _cutoff_indices(ranked)

    # Dump info
    for i in decades:
        file_name = 'dist.noms.{}.txt'.format(year_to_string(i))
        _write_distribution(os.path.join(output_folder, file_name),
                            max_ids[i], distribs[i])
    _write_distribution(os.path.join(output_folder, 'dist.noms.all.txt'),
                        max_ids['all'], distribs['all'])
|
||
|
||
def main():
    """Parse the command line and run :func:`parse_insee`.

    Both ``-i/--input`` and ``-o/--output`` are mandatory. Without them
    ``parse_insee`` would receive ``None`` and fail with a ``TypeError``
    instead of a usage message.
    """
    parser = argparse.ArgumentParser(description='Parse INSEE Names file')
    parser.add_argument('-i', '--input', dest='insee_file', required=True,
                        help='Input Insee file')
    parser.add_argument('-o', '--output', dest='output_folder', required=True,
                        help='Output folder for files')
    args = parser.parse_args()
    parse_insee(args.insee_file, args.output_folder)


if __name__ == '__main__':
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,29 @@ | ||
import prenoms.utils | ||
from prenoms.utils import Gender, Originality | ||
from random import random | ||
|
||
__title__ = 'prenoms' | ||
__version__ = '0.1.8' | ||
__version__ = '0.5.1' | ||
__author__ = 'Cyril Novel' | ||
__license__ = 'MIT' | ||
|
||
|
||
def get_prenom(originality: prenoms.utils.Originality = prenoms.utils.Originality.COMMON,
               gender: prenoms.utils.Gender = None,
               year: int = None):
    """Return a first name obtained from ``prenoms.utils.get_name``.

    :param originality: forwarded unchanged to ``get_name``.
    :param gender: forwarded to ``get_name``; when ``None``, ``Gender.MALE``
        or ``Gender.FEMALE`` is drawn with equal probability.
    :param year: forwarded unchanged to ``get_name``.
    """
    # The former float-based definition of this function was shadowed by
    # this one and has been removed as dead code.
    if gender is None:
        gender = (prenoms.utils.Gender.MALE if random() < 0.5
                  else prenoms.utils.Gender.FEMALE)
    return prenoms.utils.get_name('first', originality, year, gender)
|
||
|
||
def get_nom(originality: prenoms.utils.Originality = prenoms.utils.Originality.COMMON,
            year: int = None):
    """Return a last name obtained from ``prenoms.utils.get_name``.

    :param originality: forwarded unchanged to ``get_name``.
    :param year: forwarded unchanged to ``get_name``.
    """
    # The former float-based definition of this function was shadowed by
    # this one and has been removed as dead code.
    return prenoms.utils.get_name('last', originality, year)
|
||
|
||
def get_nom_complet(originality: prenoms.utils.Originality = prenoms.utils.Originality.COMMON,
                    gender: prenoms.utils.Gender = None,
                    year: int = None):
    """Return ``"<first name> <last name>"`` with surrounding whitespace stripped.

    The first name comes from ``get_prenom(originality, gender, year)`` and
    the last name from ``get_nom(originality, year)``.
    """
    # The former float-based definition of this function was shadowed by
    # this one and has been removed as dead code.
    first_name = get_prenom(originality, gender, year)
    last_name = get_nom(originality, year)
    return '{} {}'.format(first_name, last_name).strip()
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.