Skip to content

Commit

Permalink
Create sorted glossaries in _data by lang
Browse files Browse the repository at this point in the history
  • Loading branch information
froggleston committed Sep 17, 2024
1 parent 9088323 commit 2b7c79f
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 33 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/yaml-lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
python-version: [3.11]

steps:
- uses: actions/checkout@v3
Expand All @@ -17,8 +17,9 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
# GitHub-hosted runners execute steps as a non-root user: apt needs sudo,
# and the package index must be refreshed before installing.
sudo apt-get update
sudo apt-get install -y build-essential libicu-dev
python -m pip install --upgrade pip
pip install yamllint
pip install yamllint pycld2 pyicu-binary
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint _config.yml with yamllint
run: |
Expand Down
21 changes: 12 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,37 @@ all : commands
commands :
@grep -h -E '^##' ${MAKEFILE_LIST} | sed -e 's/## //g' | column -t -s ':'

_data/glossary.yml : ./glossary.yml
@mkdir -p _data
@cp $< $@

sort-glossary : _data/glossary.yml
@yamllint glossary.yml
@python utils/sort-glossary.py

## site : rebuild GitHub Pages site locally.
site : _data/glossary.yml
site : sort-glossary
rm -rf .jekyll-cache .jekyll-metadata _site
bundle exec jekyll build

## gh-site : builds the website for GitHub pages (part of the GH Actions workflow)
gh-site : _data/glossary.yml
gh-site : sort-glossary
@rm -rf _gh-site
@mkdir -p _gh-site
@cp -r `ls -A | grep -v '.git' | grep -v '_gh-site' | grep -v '_site'` _gh-site
@mkdir -p _gh-site/_data
# '$<' is the first prerequisite, which is the 'sort-glossary' target rather than a file, so name the glossary explicitly.
@cp _data/glossary.yml _gh-site/_data/glossary.yml

## serve : serve GitHub Pages site locally.
serve : _data/glossary.yml
serve : sort-glossary
rm -rf _site
bundle exec jekyll serve -I

## clean : clean up unneeded files.
clean :
@rm -rf _site
@find . -name '*~' -exec rm {} \;
@rm -f _data/glossary.yml
@rm -rf _data/*

## check : check glossary consistency.
check :
Expand All @@ -37,8 +45,3 @@ check :
## checkall : check glossary consistency including missing terms in all languages.
checkall :
@python utils/check-glossary.py -A _config.yml glossary.yml

# Create copy of glossary file for GitHub Pages site.
_data/glossary.yml : ./glossary.yml
@mkdir -p _data
@cp $< $@
71 changes: 49 additions & 22 deletions utils/sort-glossary.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import icu

languages = [
# set up supported languages
LANGUAGES = [
('aa', 'Afar'),
('ab', 'Abkhazian'),
('af', 'Afrikaans'),
Expand Down Expand Up @@ -212,13 +213,15 @@
('zu', 'Zulu')
]

def _sort_terms(count_dict):

def _sort_terms(count_dict, data_path):
# sort and reassign terms
for lang in count_dict:
# check 2-letter language codes vs 3-letter language codes
# std_lang = standardize_tag(lang)
# print(f"{lang} -> {std_lang} -> {Language.get(std_lang).to_alpha3()}")

# create a locale from the language code and sort the terms with a collator
# create a locale from the language code and a collator to perform sorting
icu_locale = icu.Locale(lang)
collator = icu.Collator.createInstance(icu_locale)

Expand All @@ -227,16 +230,18 @@ def _sort_terms(count_dict):
lang_path = data_path.joinpath(lang)
lang_path.mkdir(parents=True, exist_ok=True)

# sort
# sort and store sorted terms separate from the original list
sorted_terms = sorted(count_dict[lang]["terms"], key=collator.getSortKey)
count_dict[lang]["sorted_terms"] = sorted_terms
return count_dict

def _setup_dict(glossary):

def _setup_dict(glossary, data_path):
# data structure to hold counts and terms
count_dict = {}
lang_codes = []

for cc in languages:
lang_codes = []
for cc in LANGUAGES:
count_dict[cc[0]] = {}
count_dict[cc[0]]["count"] = 0
count_dict[cc[0]]["name"] = cc[1]
Expand All @@ -248,6 +253,7 @@ def _setup_dict(glossary):
# total number of glossary terms
# print(len(glos))

# count terms and store them in the data structure
for slug in glossary:
for lang in slug.keys():
if lang in lang_codes:
Expand All @@ -259,50 +265,71 @@ def _setup_dict(glossary):
"def": slug[lang]["def"]
}
)
return _sort_terms(count_dict)

# return the data structure including sorted terms
return _sort_terms(count_dict, data_path)


def _build_lang_glossary(count_dict):
glossary_by_lang = {}
for lang in count_dict:
sorted_glossary = []

# process the data structure to create a new sorted glossary per language
for sorted_term in count_dict[lang]["sorted_terms"]:
if sorted_term in count_dict[lang]["term_entry_map"]:
term_map = count_dict[lang]["term_entry_map"][sorted_term]
slug = term_map["slug"]
_def = term_map["def"]

# use an OrderedDict to retain insertion order
sorted_glossary.append(OrderedDict({
"slug": slug,
lang: {
"term": sorted_term,
"def": _def
}
}))

# only include languages with terms
if sorted_glossary:
glossary_by_lang[lang] = sorted_glossary
return glossary_by_lang


def setup_yaml():
    """Register a YAML representer that dumps OrderedDict as a plain mapping.

    The mapping is built from ``data.items()``, i.e. in the OrderedDict's own
    iteration order. Approach taken from https://stackoverflow.com/a/8661021
    """
    def _ordered_dict_as_map(dumper, ordered):
        pairs = ordered.items()
        return dumper.represent_mapping('tag:yaml.org,2002:map', pairs)

    yaml.add_representer(OrderedDict, _ordered_dict_as_map)

# load main glossary file
glos = yaml.safe_load(Path('glossary.yml').read_text())
data_path = Path("_data/")

# sort terms
count_dict = _setup_dict(glos)
def main():
    """Sort the glossary per language and write one YAML file per language.

    Reads ``_data/glossary.yml`` from the directory two levels above this
    file, groups and sorts its terms with ``_setup_dict`` and
    ``_build_lang_glossary``, then writes each language's entries to
    ``_data/<lang>/glossary.yml`` under that same ``_data`` directory.

    Raises:
        SystemExit: when the glossary cannot be read or parsed, or an output
            file cannot be written. The process then exits non-zero, so
            ``make`` stops instead of building from stale data.
    """
    # resolve everything from the script location, not the working directory
    current_path = Path(__file__).resolve()
    data_path = current_path.parent.parent.joinpath("_data/")
    glossary_path = data_path.joinpath("glossary.yml")

    try:
        # load main glossary file; explicit encoding avoids locale-dependent decoding
        glos = yaml.safe_load(glossary_path.read_text(encoding="utf-8"))

        # sort terms
        sort_dict = _setup_dict(glos, data_path)

        # rebuild glossary per language
        sorted_glossary_by_lang = _build_lang_glossary(sort_dict)

        # setup yaml for outputting
        setup_yaml()
        for lang, entries in sorted_glossary_by_lang.items():
            # write next to the input, into the same tree the sort step uses
            out_path = data_path.joinpath(lang, "glossary.yml")
            out_path.parent.mkdir(parents=True, exist_ok=True)
            # the context manager closes (and flushes) every output file
            with out_path.open("w", encoding="utf-8") as out_file:
                yaml.dump(entries, out_file)
    except (OSError, yaml.YAMLError) as err:
        # expected I/O or parse failures: short message, non-zero exit status
        raise SystemExit(f"sort-glossary failed: {err}") from err

# setup yaml for outputting
setup_yaml()
for lang in sorted_glossary_by_lang:
pprint.pprint(sorted_glossary_by_lang[lang])
yaml.dump(sorted_glossary_by_lang[lang], Path(f'_data/{lang}/glossary.yml').open('w'))

# output counts
# pprint.pprint(count_dict)
if __name__ == '__main__':
main()

0 comments on commit 2b7c79f

Please sign in to comment.