Source code for wdp.format

from typing import List, Tuple, Dict, Any
import re
from wdp.models import Word
from jinja2 import Template

# TODO: add IPA handling e.g. {{IPA|en|foo|bar}}
from wdp.validate import validate_word

# Jinja2 template for one Wikitext entry: a language heading followed by the sections of every
# word in the entry. format_entry renders it with these variables:
#   lang_name, lang_code -- text of the top-level heading / code passed to the Wikitext templates
#   words                -- list of dicts (Word.to_dict() plus a "grouped_definitions" mapping
#                           from part of speech to a list of definitions)
#   section(depth, text) -- callable returning a formatted section header
#   LL, RR               -- rendered as the literal "{{" and "}}" of Wikitext templates, which
#                           would otherwise be parsed as Jinja expressions
# The indentation inside the template only exists for readability; format_entry strips leading
# spaces and collapses the surplus blank lines after rendering.
ENTRY_TEMPLATE = Template(
    """=={{lang_name}}==

{% for word in words %}
    {% if words|length > 1 %}
        {{ section(2, "Etymology " ~ loop.index) }}
    {% endif %}

    {% if word['alternative_forms'] %}
        {{ section(3, "Alternative forms") }}
        {% for form in word['alternative_forms'] %}
            * {{LL}}alter|{{lang_code}}|{{form.alternative_form}}||{{form.description_of_use}}{{RR}}
        {% endfor %}
    {% endif %}
    
    {% if word['description'] %}
      {{ section(3, "Description") }}
      {{ word['description'] }}
    {% endif %}
    
    {% if word['etymology'] %}
      {{ section(3, "Etymology") }}
      {{ word['etymology'] }}
    {% endif %}

    {% if word['pronunciations'] %}
        {{ section(3, "Pronunciation") }}
        {% for pronunciation in word['pronunciations'] %}
            {% if pronunciation.notation|lower == "ipa" %}
                * {% if pronunciation.accent %}{{LL}}a|{{pronunciation.accent}}{{RR}} {% endif %}{{LL}}IPA|{{lang_code}}|{{pronunciation.pronunciation}}{{RR}}
            {% else %}
                * {% if pronunciation.accent %}{{LL}}a|{{pronunciation.accent}}{{RR}} {% endif %}{{pronunciation.pronunciation}}
            {% endif %}
        {% endfor %}
    {% endif %}

    {% for pos in word["grouped_definitions"] %}
        {{ section(3, pos.capitalize()) }}
        {{LL}}head|{{lang_code}}|{{pos}}{{RR}}
        
        {% for definition in word["grouped_definitions"][pos] %} 
            # {{definition.definition}}
            {% for usage_example in definition.usage_examples %}
            #: {{LL}}uxi|{{lang_code}}|{{usage_example.text}}|{{usage_example.translation}}{{RR}}
            {% endfor %}
        {% endfor %}
    {% endfor %}
    
    {% if word['usage_notes'] %}
      {{ section(3, "Usage notes") }}
      {{ word['usage_notes'] }}
    {% endif %}
    
    {% if word['conjugation'] %}
      {{ section(3, "Conjugation") }}
      {{ word['conjugation'] }}
    {% endif %}
    
    {% if word['declension'] %}
      {{ section(3, "Declension") }}
      {{ word['declension'] }}
    {% endif %}
    
    {% if word['inflection'] %}
      {{ section(3, "Inflection") }}
      {{ word['inflection'] }}
    {% endif %}
    
    {% if word['references'] %}
      {{ section(3, "References") }}
      {{ word['references'] }}
    {% endif %}
{% endfor %}
"""
)


def group_definitions_by_pos(context: dict):
    """
    Return a dict mapping from lower-cased part of speech to a list of all the definitions with
    that part of speech.

    Parts of speech are compared case-insensitively, so definitions tagged "Noun" and "noun" end up
    in the same group instead of one group overwriting the other. Keys appear in order of first
    occurrence and definitions keep their original relative order, so the result is deterministic.

    Args:
        context: a dict whose "definitions" entry is a list of dicts, each with a
                 "part_of_speech" string

    Returns: dict from lower-cased part of speech to the list of matching definition dicts
    """
    grouped: dict = {}
    for definition in context["definitions"]:
        # single pass: normalise the key first so case variants share one bucket
        grouped.setdefault(definition["part_of_speech"].lower(), []).append(definition)
    return grouped


def format_entry(word_group: List[Word], lang_code: str, lang_name: str) -> Tuple[str, str]:
    """
    Render a group of Word objects as a single Wikitext entry.

    Args:
        word_group: the Word objects that make up the entry; when there is more than one, each is
                    rendered under its own numbered "Etymology" header
        lang_code:  language code inserted into the Wikitext templates
        lang_name:  language name used for the top-level heading

    Returns: a (word_form, wikitext) tuple, where word_form is taken from the first Word
    """
    has_several_words = len(word_group) > 1

    def section(depth, content):
        """
        Build a section header, nested one level deeper when the entry has several words.
        Args:
            depth:   the level the section has in an entry with a single etymology
            content: a string to be displayed in a header

        Returns: Formatted section header
        """
        level = depth + 1 if has_several_words else depth
        marker = "=" * level
        return marker + content + marker

    # one template context per Word, with its definitions split up by part of speech
    word_contexts = []
    for entry in word_group:
        as_dict = entry.to_dict()
        as_dict["grouped_definitions"] = group_definitions_by_pos(as_dict)
        word_contexts.append(as_dict)

    wikitext = ENTRY_TEMPLATE.render(
        section=section,
        LL="{{",
        RR="}}",
        lang_code=lang_code,
        lang_name=lang_name,
        words=word_contexts,
    )

    # undo formatting that made the jinja template easier to read: first the indentation ...
    wikitext = "\n".join(re.sub(r"^ +", "", row) for row in wikitext.split("\n"))
    # ... then the blank lines; the substitutions are order-dependent
    cleanup_steps = (
        (r"\n\n+#", "\n#"),
        (r"=\n+=", "=\n="),
        (r"\n\n+", "\n\n"),
        (r"===\n\n", "===\n"),
        # typography: an empty section should have extra newline
        (r"=\n=", "=\n\n="),
        # headword template should have extra newline after
        (r"({{head\|[^\n]*})\n#", r"\1\n\n#"),
    )
    for pattern, replacement in cleanup_steps:
        wikitext = re.sub(pattern, replacement, wikitext)

    return word_group[0].word_form, wikitext


def group_words(words: List[Word]) -> List[List[Word]]:
    """
    Group words based on their word_form attribute.

    Groups are returned in the order in which each word_form first appears in ``words``, and the
    words inside a group keep their original relative order, so the result is the same on every
    run (iterating over a set of word forms is not).

    Args:
        words: the Word objects to group

    Returns: a list of groups, each a list of the Words sharing one word_form
    """
    groups: dict = {}
    for word in words:
        # dicts preserve insertion order, which gives first-seen ordering of the groups
        groups.setdefault(word.word_form, []).append(word)
    return list(groups.values())


def format_entries(words: List[Word], lang_code: str, lang_name: str) -> List[Tuple[str, str]]:
    """
    Validate the given words, group them by word form and format each group as Wikitext.

    Args:
        words:     the Word objects to format
        lang_code: language code passed on to format_entry
        lang_name: language name passed on to format_entry

    Returns: one (word_form, wikitext) tuple per group of words
    """
    # every word is validated up front, so an exception from validate_word propagates before
    # any grouping or formatting takes place
    for candidate in words:
        validate_word(candidate)
    return [format_entry(group, lang_code, lang_name) for group in group_words(words)]