Edgewall Software

Version 1 (modified by David Fraser <davidf@…>, 17 years ago) (diff)

Initial posting of module

This is code to aid in localization of Genshi templates, without altering the underlying templates. It was originally written by Matt Good, then updated and fixed up by David Fraser.

Firstly here is a module that can be used to extract text from Genshi template streams.

import fnmatch
import os
import re
import logging
import copy

import genshi.core
import genshi.input
import genshi.eval
import genshi.template

# Tags whose entire content is skipped during extraction/localization.
ignore_tags = ['script', 'style']
# Attributes whose string values are treated as localizable text.
include_attribs = ['title', 'alt', 'longdesc']
# Directory names pruned while walking source trees.
exclude_dirs = ('.AppleDouble', '.svn', 'CVS', '_darcs')
# Matches _('...') / _("...") gettext calls inside template expressions;
# the single capture group includes the surrounding quote characters.
gettext_re = re.compile(r"_\(((?:'[^']*')|(?:\"[^\"]*\"))\)")

# Byte-value -> .po escape-sequence table; populated by make_escapes() below.
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the module-level ``escapes`` table mapping each byte value
    0..255 to its .po-file representation.

    If ``pass_iso8859`` is true, iso-8859 characters pass through so that
    e.g. 'msgid "Höhe"' does not become 'msgid "H\\366he"'; otherwise any
    character outside the printable 32..126 range is octal-escaped.

    Returns the freshly built table (also stored in the ``escapes`` global).
    """
    global escapes
    if pass_iso8859:
        mod = 128
    else:
        mod = 256
    # Build a fresh 256-entry table and assign it atomically.  The original
    # appended to the existing global, so a second call (e.g. to switch
    # modes) grew the list past index 255 and left indices 0..255 -- the
    # only ones escape() reads -- unchanged.
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # C-style escapes for the characters .po files treat specially.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    escapes = table
    return escapes

make_escapes(False)

def escape(s):
    """Return *s* with every character replaced by its .po-file escape
    sequence from the module-level ``escapes`` table.

    The original joined with the undefined name ``EMPTYSTRING`` (a leftover
    from pygettext, which defines it as ``''``), raising NameError on every
    call; join with the empty-string literal directly instead.  The
    ``global`` declaration was also dropped: the table is only read here.
    """
    return ''.join([escapes[ord(c)] for c in s])

def normalize(s):
    """Render a Python string in the C-like quoted form used for msgid
    values in .po files (single-line or multi-line as appropriate)."""
    lines = s.split('\n')
    if len(lines) == 1:
        # Single-line value: one quoted, escaped string.
        return '"' + escape(s) + '"'
    # Fold a trailing newline back onto the last real line so the output
    # does not end with an empty quoted segment.
    if not lines[-1]:
        del lines[-1]
        lines[-1] += '\n'
    escaped_lines = [escape(line) for line in lines]
    # Empty first string, then one quoted segment per line.
    return '""\n"' + '\\n"\n"'.join(escaped_lines) + '"'

def lang_extract(potfile, source_files, template_class=None):
    """Append a msgid entry (with '#:' location comments) to the end of the
    .pot file *potfile* for every text string extracted from *source_files*."""
    fd = open(potfile, 'at+')
    try:
        # Collect every occurrence of each key, remembering first-seen order.
        locations = {}
        ordered_keys = []
        for fname, linenum, key in extract_keys(source_files, ['.'], template_class):
            occurrences = locations.setdefault(key, [])
            if not occurrences:
                ordered_keys.append(key)
            occurrences.append((fname, linenum))
        for key in ordered_keys:
            for fname, linenum in locations[key]:
                fd.write('#: %s:%s\n' % (fname, linenum))
            fd.write('msgid %s\n' % normalize(key))
            fd.write('msgstr ""\n\n')
    finally:
        fd.close()

def _matching_files(dirname, fileglob):
    """Walk *dirname* recursively and yield the path of every file whose
    name matches *fileglob*, pruning VCS/metadata directories."""
    for root, dirs, files in os.walk(dirname):
        # Remove excluded names in place so os.walk never descends into them.
        for skipped in exclude_dirs:
            if skipped in dirs:
                dirs.remove(skipped)
        for matched in fnmatch.filter(files, fileglob):
            yield os.path.join(root, matched)

def extract_keys(files, search_path=None, template_class=None):
    """finds all the text keys in the given files"""
    loader = genshi.template.TemplateLoader(search_path)
    for fname in files:
        logging.info('Scanning l10n keys from: %s' % fname)
        try:
            if template_class is None:
                template = loader.load(fname)
            else:
                template = loader.load(fname, cls=template_class)
        except genshi.input.ParseError, e:
            logging.warning('Skipping extracting l10n keys from %s: %s' % (fname, e))
            continue
        for linenum, key in extract_from_template(template):
            yield fname, linenum, key

def extract_from_template(template, search_text=True):
    """Convenience wrapper: extract (linenumber, key) pairs from a parsed
    template by walking its stream."""
    return extract_from_stream(template.stream, search_text=search_text)

def extract_from_stream(stream, search_text=True):
    """takes a MatchTemplate.stream (not a normal XML Stream) and searches for localizable text, yielding linenumber, text tuples

    search_text is set to False when extracting from substreams that are
    attribute values of an attribute which is not localizable text; in that
    case only Python strings inside expressions are extracted.
    """
    stream = iter(stream)
    tagname = None
    skip_level = 0
    for kind, data, pos in stream:
        # pos is the event position tuple; index 1 is the line number.
        linenum = pos[1]
        # (A leftover debug "print kind, linenum" was removed here; it wrote
        # to stdout for every single stream event.)
        if skip_level:
            # Inside an ignored tag (e.g. <script>): only track the nesting
            # depth of further ignored tags until the matching END arrives.
            if kind is genshi.core.START:
                tag, attrs = data
                if tag.localname in ignore_tags:
                    skip_level += 1
            if kind is genshi.core.END:
                tag = data
                if tag.localname in ignore_tags:
                    skip_level -= 1
            continue
        if kind is genshi.core.START:
            tag, attrs = data
            tagname = tag.localname
            if tagname in ignore_tags:
                # skip the substream
                skip_level += 1
                continue
            for name, value in attrs:
                if isinstance(value, basestring):
                    # Plain string attribute: only yield it when it is one
                    # of the localizable attributes and text search is on.
                    if search_text and name in include_attribs:
                        yield linenum, value
                else:
                    # Attribute value is itself a substream (it contains
                    # expressions); recurse, searching its text only when
                    # the attribute is localizable.
                    for dummy, key in extract_from_stream(value,
                                                          name in include_attribs):
                        yield linenum, key
        elif kind is genshi.template.EXPR:
            if data.source != "?":
                # TODO: check if these expressions should be localized
                for key in gettext_re.findall(data.source):
                    key = key[1:-1]  # strip the surrounding quotes
                    if key:
                        yield linenum, key
        elif kind is genshi.core.TEXT and search_text:
            key = data.strip()
            if key:
                yield linenum, key
        elif kind is genshi.template.SUB:
            # A sub-stream (directive body): recurse with the same settings.
            sub_kind, sub_stream = data
            for linenum, key in extract_from_stream(sub_stream, search_text):
                yield linenum, key

The following function can then be used to localize the template stream (see below for details on use):

def localize_template(template_source_stream, ugettext, search_text=True):
    """localizes the given template source stream (i.e. genshi.XML(template_source), not the parsed template's stream)
    need to pass in the ugettext function you want to use

    Yields (kind, data, pos) events with localizable text replaced by the
    result of ugettext.  search_text is set to False when recursing into
    substreams that are values of non-text attributes; in that case only
    Python strings in expressions are candidates.
    """
    # NOTE: this MUST NOT modify the underlying objects or template reuse will break
    # in addition, if it calls itself recursively it must convert the result to a list or it will break on repetition
    stream = iter(template_source_stream)
    skip_level = 0
    for kind, data, pos in stream:
        # handle skipping whole chunks we don't want to localize (just yielding everything in them)
        if skip_level:
            if kind is genshi.core.START:
                tag, attrs = data
                tag = tag.localname
                if tag in ignore_tags:
                    skip_level += 1
            if kind is genshi.core.END:
                tag = data.localname
                if tag in ignore_tags:
                    skip_level -= 1
            yield kind, data, pos
            continue
        # handle different kinds of things we want to localize
        if kind is genshi.core.START:
            tag, attrs = data
            tagname = tag.localname
            if tagname in ignore_tags:
                skip_level += 1
                yield kind, data, pos
                continue
            # Work on a copy of the attributes so the original event is untouched.
            new_attrs = genshi.core.Attrs(attrs[:])
            changed = False
            for name, value in attrs:
                if isinstance(value, basestring):
                    if search_text and name in include_attribs:
                        # BUGFIX: the original called ugettext(search_text),
                        # translating the boolean flag instead of the
                        # attribute's text (the extraction side yields
                        # `value` for the same case).
                        new_value = ugettext(value)
                        new_attrs.set(name, new_value)
                        changed = True
                else:
                    # this seems to be handling substreams, so we should get back a localized substream
                    # note: passing search_text=False implies far fewer matches, this may be wasteful and the subcall could be skipped in some cases
                    new_value = list(localize_template(value, ugettext, search_text=(name in include_attribs)))
                    new_attrs.set(name, new_value)
                    changed = True
            if changed:
                # ensure we don't change the original string
                attrs = new_attrs
            yield kind, (tag, attrs), pos
        elif kind is genshi.template.EXPR:
            if data.source != "?":
                # TODO: check if these expressions should be localized
                for key in gettext_re.findall(data.source):
                    key = key[1:-1]
                    if key:
                        new_key = ugettext(key)
                        # TODO: if we do this, it needs to be fixed :-)
                        # NOTE(review): new_data is computed but the original
                        # (untranslated) expression is still yielded below --
                        # deliberately left as-is pending the TODO above.
                        new_data = genshi.eval.Expression(data.source.replace(key, new_key))
                        # we lose the following data, but can't assign as its readonly
                        # new_data.code.co_filename = data.code.co_filename
                        # new_data.code.co_firstlineno = data.code.co_firstlineno
            yield kind, data, pos
        elif kind is genshi.core.TEXT and search_text:
            # we can adjust this as strings are immutable, so this won't change the original string
            key = data.strip()
            if key:
                new_key = ugettext(key)
                data = data.replace(key, new_key)
            yield kind, data, pos
        elif kind is genshi.template.SUB:
            # Recurse into the sub-stream; materialize as a list so the
            # localized template can be rendered more than once.
            sub_kind, sub_stream = data
            new_sub_stream = list(localize_template(sub_stream, ugettext, search_text=search_text))
            yield kind, (sub_kind, new_sub_stream), pos
        else:
            yield kind, data, pos