Ugrás a tartalomhoz

Szerkesztő:Tgr/huflex

A Wikipédiából, a szabad enciklopédiából
-- inflection and other language processing functions for Hungarian
local Huflex = {}
local sf, utils = require('Module:StringFunctions'), require('Module:Utils')

-- Hungarian vowels, grouped by the classes that matter for vowel harmony.
local vowels = {
    -- back vowels
    back = Set {'a', 'á', 'o', 'ó', 'u', 'ú'},
    -- unrounded (illabial) front vowels
    frontIllabial = Set {'e', 'é', 'i', 'í'},
    -- rounded (labial) front vowels
    frontLabial = Set {'ö', 'ő', 'ü', 'ű'},
    -- maps every short vowel to its long counterpart
    shortToLong = {
        a = 'á', e = 'é', i = 'í', o = 'ó', ['ö'] = 'ő', u = 'ú', ['ü'] = 'ű',
    },
}
-- derived sets: all front vowels, and every vowel
vowels.front = vowels.frontIllabial + vowels.frontLabial
vowels.all = vowels.front + vowels.back

-- Hungarian consonant digraphs and their doubled (long) spellings.
local digraphs = {
    short = Set {'cs', 'dz', 'gy', 'ly', 'ny', 'sz', 'zs'},
    long = Set {'ccs', 'ddz', 'ggy', 'lly', 'nny', 'ssz', 'zzs'},
}

-- Returns the long form of a short digraph: only its first letter is doubled
-- (e.g. 'sz' -> 'ssz').
function digraphs.shortToLong(digraph)
    local firstLetter = digraph:sub(1, 1)
    return firstLetter .. digraph
end

-- Determines the vowel harmony of a word.
-- Only the part after the last space or hyphen is taken into account.
-- @param word  UTF-8 string
-- @return the vowel harmony: 'front', 'back' or 'mixed' ('back' is also the result when
--         the examined part contains no vowel at all)
-- @return the last vowel of the examined part, or nil if it has none
function Huflex.vowelHarmony(word)
    if word:sub(-4) == 'wiki' then
        -- the last part of a composite word determines its vowel harmony; there is no way to handle this generally,
        -- but sitenames ending with 'wiki' are frequent enough to handle as a special case
        return 'front', 'i'
    end

    -- All state is local: as globals these values would survive between calls and a vowel
    -- found in an earlier word could be reported for a later one.
    local hasBack, hasFront = false, false
    local lastVowel = nil
    for c in sf.split(word) do
        if c == ' ' or c == '-' then -- only the last word counts; start again
            hasBack, hasFront = false, false
            lastVowel = nil
        elseif vowels.all:has(c) then
            lastVowel = c
            if vowels.back:has(c) then
                hasBack = true
            else
                hasFront = true
            end
        end
    end

    local harmony
    if hasFront and hasBack then
        harmony = 'mixed'
    elseif hasFront then
        harmony = 'front'
    else
        harmony = 'back'
    end
    return harmony, lastVowel
end

-- Selects the matching suffix form based on vowel harmony.
-- @param vowelHarmony  'back', 'front' or 'mixed' (as returned by Huflex.vowelHarmony)
-- @param lastVowel     last vowel of the word (may be nil when the word has no vowel)
-- @param suffix1       back form of the suffix (or its only form)
-- @param suffix2       front form of the suffix; nil if the suffix has a single form
-- @param suffix3       front rounded (labial) form of the suffix; nil if there is none
-- @return the selected suffix form
-- Raises an error if the suffix has several forms and vowelHarmony is not one of the
-- three known values.
function Huflex.selectSuffix(vowelHarmony, lastVowel, suffix1, suffix2, suffix3)
    if suffix2 == nil then -- trivial case, suffix has only one form
        return suffix1
    end

    if vowelHarmony == 'back' then
        return suffix1
    elseif vowelHarmony == 'front' then
        if suffix3 ~= nil and vowels.frontLabial:has(lastVowel) then
            return suffix3
        end
        return suffix2
    elseif vowelHarmony == 'mixed' then
        -- in a mixed word a back or unrounded front last vowel takes the back form
        if vowels.back:has(lastVowel) or vowels.frontIllabial:has(lastVowel) then
            return suffix1
        end
        -- the last vowel is front labial: prefer the labial form when the suffix has one
        if suffix3 ~= nil then
            return suffix3
        end
        return suffix2
    end

    error('invalid value for vowelHarmony: ' .. tostring(vowelHarmony), 2)
end

-- Combines a word with a suffix according to Hungarian grammar (vowel harmony + assimilation).
-- Far from perfect, but should work with the suffixes following {{SITENAME}} in the interface
-- messages, unless the sitename is some tricky composite or foreign word.
-- addSuffix does three things:
-- 1) selects the suffix with matching vowel harmony (first suffix should be back, second front,
--    third rounded (labial); second and third might be omitted if the suffix has fewer forms).
-- 2) if the last letter of the word is 'a', 'e' or 'o', changes it to 'á', 'é' or 'ó' respectively.
-- 3) if the first letter of the suffix is 'v' and the word ends with a consonant, changes it
--    according to assimilation rules. (This can get complicated if the last letter of the word
--    is a digraph/trigraph or a double consonant.) If instead both the end of the word and the
--    start of the suffix are vowels, the first letter of the suffix is dropped.
-- TODO http://www.szabogabor.net/2011/09/27/val-vel-rag-generalasa-szohoz/
-- @param frame  frame object; frame.args[1] is the word, frame.args[2..4] are the suffix forms
-- @return the suffixed word
function Huflex.addSuffix(frame)
    -- Read and trim every argument on its own: unpacking a list that may contain nil holes
    -- (an omitted suffix form) is not reliable.
    local function argument(index)
        local value = frame.args[index]
        if value == nil then
            return nil
        end
        return sf.trim(value)
    end
    local word, suffix1, suffix2, suffix3 = argument(1), argument(2), argument(3), argument(4)
    if word == nil or suffix1 == nil then
        error('addSuffix: the word (1) and at least one suffix form (2) are required')
    end

    local vowelHarmony, lastVowel = Huflex.vowelHarmony(word)
    local suffix = Huflex.selectSuffix(vowelHarmony, lastVowel, suffix1, suffix2, suffix3)

    -- change end of word: a -> á, e -> é, o -> ó
    local lastChar = sf.sub(word, -1)
    if lastChar == 'a' or lastChar == 'e' or lastChar == 'o' then
        lastChar = vowels.shortToLong[lastChar]
        word = sf.sub(word, 1, -2) .. lastChar
    end

    local wordEndsWithVowel = vowels.all:has(lastChar)
    local suffixStart = sf.sub(suffix, 1, 1)

    -- change start of suffix: v assimilates if the word ends with a consonant
    if suffixStart == 'v' and not wordEndsWithVowel then
        if sf.sub(word, -2, -2) == lastChar or digraphs.long:has(sf.sub(word, -3)) then
            -- long consonant, does not get any longer
            suffix = sf.sub(suffix, 2)
        elseif digraphs.short:has(sf.sub(word, -2)) then
            -- ends with a short digraph, which becomes long
            suffix = sf.sub(suffix, 2)
            word = sf.sub(word, 1, -3) .. digraphs.shortToLong(sf.sub(word, -2))
        else
            -- single character, will become double now
            suffix = lastChar .. sf.sub(suffix, 2)
        end
    -- leave out the first character of the suffix if it is a vowel and the word also ends with a vowel
    elseif wordEndsWithVowel and vowels.all:has(suffixStart) then
        suffix = sf.sub(suffix, 2)
    end

    return word .. suffix
end

return Huflex

Utils:

-- utility functions (not namespaced!)

-- Applies a function to every value of a table.
-- Returns a new table holding func(value) under the same key; the input table is not modified.
function map(func, array)
  local mapped = {}
  for key, value in pairs(array) do
    mapped[key] = func(value)
  end
  return mapped
end

-- Returns the requested arguments of a frame object as multiple values.
-- @param frame  object with an `args` table
-- @param ...    keys (positions or names) to look up in frame.args
-- @return one value per requested key, in the order the keys were given; a missing argument
--         yields nil in its position
function unpackFrame(frame, ...)
    local unpackValues = table.unpack or unpack -- `unpack` moved into `table` in Lua 5.2
    local count = select('#', ...)
    local keys = {...}
    local values = {}
    for position = 1, count do
        -- store by position, not by key, so that named keys and missing arguments work too
        values[position] = frame.args[keys[position]]
    end
    -- explicit bounds: the length of a table with nil holes is not well defined
    return unpackValues(values, 1, count)
end

-- Returns a new table with keys and values swapped: every value of the original becomes a key
-- that maps to the original key. The input table is not modified.
function table.reverse(tbl)
    local swapped = {}
    for key, value in pairs(tbl) do
        swapped[value] = key
    end
    return swapped
end

-- Merges two tables into a newly created one; neither input is modified.
-- Where both tables have the same key, the value from the second table wins.
function table.merge(tbl1, tbl2)
    local merged = {}
    local function copyFrom(source)
        for key, value in pairs(source) do
            merged[key] = value
        end
    end
    copyFrom(tbl1)
    copyFrom(tbl2)
    return merged
end

-- Returns a string representation of a table, meant for debugging.
-- @param tbl        the table to dump; any other value yields '<not a table!>'
-- @param recursive  if truthy, nested tables are dumped as well, otherwise they are shown as
--                   'table' (there is no cycle detection, so a self-referencing table must
--                   not be dumped recursively)
-- @return a string of the form '{ [key] = value,[key] = value,} '; entries appear in the
--         order pairs() yields them
function table.dump(tbl, recursive)
    if type(tbl) ~= 'table' then
        return '<not a table!>'
    end
    local entries = {}
    for k, v in pairs(tbl) do
        -- numeric keys are shown bare, every other key is quoted; tostring keeps keys that
        -- are not strings (e.g. booleans) from raising a concatenation error
        local keyText
        if type(k) == 'number' then
            keyText = tostring(k)
        else
            keyText = '"' .. tostring(k) .. '"'
        end

        local valueText
        if type(v) == 'table' then
            if recursive then
                valueText = table.dump(v, true)
            else
                valueText = 'table'
            end
        else
            -- there is no automatic conversion to string for booleans, functions etc.
            valueText = tostring(v)
        end

        entries[#entries + 1] = '[' .. keyText .. '] = ' .. valueText .. ','
    end
    return '{ ' .. table.concat(entries) .. '} '
end

-- Metatable shared by all sets created with Set().
-- A set is a table whose keys are the members; it offers
--   set:has(key)  membership test
--   set1 + set2   union, returned as a new set
SetMeta = {
    __index = {
        -- Returns true if key is a member of the set.
        -- rawget is required here: a plain tbl[key] lookup falls back to this very __index
        -- table, which would make 'has' look like a member of every set.
        has = function(tbl, key)
            return rawget(tbl, key) ~= nil
        end
    },
    -- Union of two sets; the operands are left untouched.
    __add = function(set1, set2)
        local set = table.merge(set1, set2)
        setmetatable(set, SetMeta)
        return set
    end,
}

-- Creates a set from a list: every list element becomes a member of the returned set.
-- The set gets SetMeta as its metatable.
function Set(list)
    local members = setmetatable({}, SetMeta)
    for _, element in ipairs(list) do
        members[element] = true
    end
    return members
end

-- Test entry point for set union.
-- Builds three sets from the first six frame arguments and returns the dumps of two unions,
-- separated by '|'.
local function testMerge(frame)
    local a, b, c, d, e, f = unpackFrame(frame, 1, 2, 3, 4, 5, 6)
    local first = Set {a, b, c}
    local second = Set {d, e, f}
    local overlapping = Set {a, b, e, f}
    local firstUnion = table.dump(first + second)
    local secondUnion = table.dump(first + overlapping)
    return firstUnion .. '|' .. secondUnion
end

local Test = {
    testMerge = testMerge,
}
return Test

StringFunctions:

-- basic string manipulation functions
local StringFunctions = {}

-- Removes leading and trailing whitespace from a string.
-- @param str  a string, or an object with an `args` table, in which case args[1] is trimmed;
--             nil is accepted
-- @return the trimmed string, or nil if there is nothing to trim
function StringFunctions.trim(str)
    -- nil has to be handled before str.args is looked up: indexing nil raises an error
    if str == nil then
        return nil
    end
    if str.args then
        str = str.args[1]
    end
    if str == nil then
        return nil
    end
    return (str:gsub("^%s*(.-)%s*$", "%1")) -- extra brackets are necessary because gsub returns multiple values
end

-- Splits a string into (Unicode) characters.
-- @param str  UTF-8 encoded string
-- @return an iterator function; every call returns the next character, then nil at the end
-- A byte that cannot start a UTF-8 sequence makes the iterator return an error marker of the
-- form '<error(position:byte)>' once and stop afterwards. Apart from that, behavior is
-- undefined for input which is invalid UTF-8.
function StringFunctions.split(str)
    local i, failed = 1, false
    return function()
        if failed then
            return nil
        end
        local byte = str:byte(i)
        if byte == nil then -- end of string
            return nil
        end

        -- determine number of 1 bits before the first 0 in byte
        local leadBits, bitValue, remainder = 0, 128, byte
        while bitValue <= remainder and bitValue > 1 do
            leadBits = leadBits + 1
            remainder = remainder - bitValue
            bitValue = bitValue / 2
        end

        local length -- length of the next UTF-8 character in bytes
        if leadBits == 0 then -- ASCII character
            length = 1
        elseif leadBits == 1 or leadBits > 6 then -- not valid UTF-8
            failed = true
            -- strings are concatenated with '..'; '+' would raise an arithmetic error
            return '<error(' .. i .. ':' .. byte .. ')>'
        else
            length = leadBits
        end

        local chr = str:sub(i, i + length - 1)
        i = i + length
        return chr
    end
end

-- UTF-8 aware version of string:len
-- Counts the items produced by StringFunctions.split for the given string.
function StringFunctions.len(str)
    local count = 0
    local nextChar = StringFunctions.split(str)
    while nextChar() ~= nil do
        count = count + 1
    end
    return count
end

-- UTF-8 aware version of string:sub
-- Positions are counted in characters as produced by StringFunctions.split. A negative i or j
-- counts from the end of the string; if j is omitted, the substring runs to the end.
function StringFunctions.sub(str, i, j)
    local first, last = i, j
    if first < 0 then
        first = StringFunctions.len(str) + first + 1
    end
    if last and last < 0 then
        last = StringFunctions.len(str) + last + 1
    end

    local pieces, index = {}, 0
    for chr in StringFunctions.split(str) do
        index = index + 1
        local withinEnd = (not last) or index <= last
        if index >= first and withinEnd then
            pieces[#pieces + 1] = chr
        end
    end
    return table.concat(pieces)
end

return StringFunctions