simple jsgf permutator subflow

About

This subflow can parse all possible sentences / permutations from a jsgf grammar file and output them as an array of sentences. This can, for example, be useful to quickly generate a text corpus to train a language model for a speech recognition system like Deepspeech or Kaldi. It can also be used to rig up a simple nlu / intent recognition system together with node-red-contrib-fuzzywuzzy. This is done by adding tags to the jsgf grammar and then choosing "move to left" for the Tags option in the subflow menu. This subflow only uses function nodes and has zero dependencies.

Usage

To create all possible sentences, pass in the jsgf file content as a string in the msg.payload. The subflow will then output all permutations of the public rule sentences described by the grammar as an array of sentences in the msg.payload. It can also be configured to output the sentences as a single string with one sentence per line. This result is sent to the first output of the subflow.

Weights

The subflow supports a simplified version of weights. This is opt-in: if you don't want weights to be ignored, select the "use weights" checkbox in the menu. Unlike the full jsgf standard, this subflow, due to its deterministic nature, only supports whole numbers as weights. A possible sentence will appear once in the result if it has either no weight or a weight of 1. A weight bigger than 1 works as a multiplication factor, so an alternative with a weight of 5, for example, will appear 5 times in the result. This can be used to fine-tune a corpus and its likelihoods for a language model, for example.

Restrictions

Right now the subflow only supports a subset of the jsgf syntax. It supports:

optionals
alternatives and whole number weights
public & private rules
single line comments
tags
multiline rules

It doesn't support:

recursive rules
weights that are not whole numbers
imports
Kleene Star(*) or Plus(+) operators
multi line comments

There may be other features missing as this is still work in progress.

Things to look out for

always wrap alternatives in parentheses
surround tags with spaces towards surrounding elements
end all rules with a semicolon (;) for correct parsing
don't include infinite loops (recursions) in your grammar
be aware that every unique sentence will only ever be output once even if it appears multiple times in the grammar
processing of large and/or complex grammars can take anywhere from seconds to minutes

Note: some third-party nodes may appear with blank styling, and not as they appear in the Node-RED Editor.

[{"id":"a71fc6a3.13964","type":"subflow","name":"jsgf permutator","info":"","category":"","in":[{"x":60,"y":120,"wires":[{"id":"64935c66.d7f52c"},{"id":"a981f80e.432d8"}]}],"out":[{"x":1200,"y":120,"wires":[{"id":"4d6f9aa6.a03db4","port":0}]},{"x":1260,"y":60,"wires":[{"id":"9d1669a5.171c18","port":0}]}],"env":[{"name":"tags","type":"str","value":"remove","ui":{"icon":"font-awesome/fa-bookmark-o","label":{"en-US":"Tags"},"type":"select","opts":{"opts":[{"l":{"en-US":"remove"},"v":"remove"},{"l":{"en-US":"leave"},"v":"leave"},{"l":{"en-US":"move to left"},"v":"left"},{"l":{"en-US":"split: remove/left"},"v":"split"}]}}},{"name":"outputType","type":"str","value":"array","ui":{"label":{"en-US":"output as"},"type":"select","opts":{"opts":[{"l":{"en-US":"Array"},"v":"array"},{"l":{"en-US":"String"},"v":"string"}]}}},{"name":"weights","type":"bool","value":"false","ui":{"label":{"en-US":"use weights"},"type":"checkbox"}}],"color":"#E7E7AE","icon":"node-red/split.svg","status":{"x":1160,"y":220,"wires":[{"id":"a981f80e.432d8","port":0},{"id":"be781da.d15a96","port":0}]}},{"id":"64935c66.d7f52c","type":"function","z":"a71fc6a3.13964","name":"ruleTokenizer","func":"const weights = env.get(\"weights\");\nfunction ruleTokenizer (rule) {\n    let ruleArrRaw = rule.split(\"\");\n    let ruleArr = [];\n    let outputToken = \"\";\n    ruleArrRaw.forEach(token => {\n        if (token.match(/[a-zA-Z0-9\\ä\\ö\\ü\\ß\\-\\_\\{\\}\\<\\>\\/]/) !== null) {\n            outputToken += token;\n        } else if (token.match(/[\\[\\]\\|\\(\\)]/) !== null) {\n            if (outputToken.length !== 0) {\n                ruleArr.push(outputToken);\n                outputToken = \"\";\n            }\n            ruleArr.push(token);\n        } else if (token.match(/\\s/) !== null) {\n            if (outputToken.length !== 0) {\n                ruleArr.push(outputToken);\n                outputToken = \"\";\n            }\n        }\n    });\n    if (outputToken.length !== 0) {\n        
ruleArr.push(outputToken);\n    }\n    return ruleArr;\n}\nlet rawRules = msg.payload.replace(/\\/\\/(.*)\\r?\\n|\\r/g, \" \").replace(/\\r?\\n|\\r/g, \" \").replace(/\\s{2,}/g, \" \");\nif (!weights) {\n    rawRules = rawRules.replace(/\\/[0-9]?[\\.0-9]*\\//g, \"\");\n}\nrawRules = rawRules.split(\";\").map(rule => rule.trim()).filter(rule => rule.length > 0);\nlet ruleName = rawRules.filter(rule => rule.match(/^grammar\\s/g))[0].replace(/^grammar\\s/g, \"\");\n(ruleName) ? flow.set(\"ruleName\", ruleName) : flow.set(\"ruleName\", false);\nrawRules = rawRules.filter(rule => rule.match(/\\=/g));\nlet rulesObj = {};\nrawRules.forEach(rule => {\n    rule = rule.split(\"=\").map(rule => rule.trim());\n    let isPublic =(rule[0].match(/^public/g) !== null) ? true : false;\n    let name = rule[0].match(/\\<(.+)\\>/g).toString().replace(/\\<|\\>/g, \"\");\n    rulesObj[name] = {\n        rule: ruleTokenizer(rule[1]),\n        public: isPublic\n    }\n});\nmsg.payload = rulesObj;\nreturn msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":250,"y":120,"wires":[["6ba0d6a0.b6ada"]]},{"id":"77f7190d.b0eef","type":"function","z":"a71fc6a3.13964","name":"ruleOptionals","func":"let finalResults = [];\nlet iterateOptionalsRule = 0;\nlet toDo = [];\nlet iterations = 998;\nfunction optionalsCreator (tokens) {\n    if (optionalsChecker(tokens)) {\n        let check = tokens.indexOf(\"[\");\n        let level = 1;\n        let withIt = [];\n        let withoutIt = [];\n        let results = [];\n        for (i = check+1; i < tokens.length; i++) {\n            if (tokens[i] === \"]\") {\n                level -= 1;\n            } else if (tokens[i] === \"[\") {\n                level += 1;\n            }\n            if (level === 0) {\n                withIt = tokens.filter((token,index) => index !== check && index !== i);\n                withoutIt = tokens.filter((token,index) => index < check || index > i);\n                break;\n            }\n        }\n        
results.push(withIt, withoutIt);\n        results.forEach(result => {\n            if (optionalsChecker(result)) {\n                toDo.push(result);\n            } else {\n                finalResults.push(result);\n            }\n        });\n        if (toDo.length === 0) {\n            if (iterateOptionalsRule < optionalsIterationLength - 1) {\n                iterateOptionalsRule += 1;\n                toDo.push(ruleArr[iterateOptionalsRule]);\n                nextOne();\n            } else {\n                node.send({payload:finalResults});\n            }\n        } else {\n            nextOne();\n        }\n    } else {\n        finalResults.push(tokens);\n        if (iterateOptionalsRule < optionalsIterationLength - 1) {\n            iterateOptionalsRule += 1;\n            toDo.push(ruleArr[iterateOptionalsRule]);\n            nextOne();\n        } else {\n            node.send({payload:finalResults});\n        }\n    }\n    return;\n}\nfunction optionalsChecker (inputArray) {\n    if (inputArray.includes(\"[\") && inputArray.includes(\"]\")) {\n        return true;\n    }\n    return false;\n}\nfunction nextOne () {\n    iterations += 1;\n    let next = toDo.shift();\n    if (Array.isArray(next)) {\n        if (iterations >= 1000) {\n            iterations = 0;\n            setTimeout(()=>{\n                optionalsCreator(next);\n            },0);\n        } else {\n            optionalsCreator(next);\n        }\n    }\n    return;\n}\nlet ruleArr = msg.payload;\nlet optionalsIterationLength = ruleArr.length;\noptionalsCreator(ruleArr[iterateOptionalsRule]);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":810,"y":120,"wires":[["4d6f9aa6.a03db4","cdc5300f.0306c"]]},{"id":"6ba0d6a0.b6ada","type":"function","z":"a71fc6a3.13964","name":"ruleExpander","func":"let currentRule = 0;\nlet ruleObj = msg.payload;\nconst ruleKeys = Object.keys(ruleObj);\nfunction iterateRules (rule) {\n    if (ruleObj[ruleKeys[currentRule]].public) {\n        
rule.unshift(\"(\");\n        rule.push(\")\");\n        for (i=0;i<rule.length;i++) {\n            if (rule[i].match(/\\<(.*)\\>/) !== null) {\n               let sub = rule[i].replace(\"<\",\"\").replace(\">\",\"\");\n               rule[i] = ruleObj[sub].rule;\n               rule[i].unshift(\"(\");\n               rule[i].push(\")\");\n            }\n        }\n        rule = rule.flat();\n        let test = false;\n        rule.forEach(token => {\n            if (token.match(/\\<(.*)\\>/) !== null) {\n                test = true;\n            }\n        });\n        ruleObj[ruleKeys[currentRule]].rule = rule;\n        if (test) {\n            iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n        } else if (currentRule < ruleKeys.length - 1) {\n            currentRule += 1;\n            iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n        } else {\n            makeIntermediateArr(ruleObj);\n        }\n    } else if (currentRule < ruleKeys.length - 1) {\n        currentRule += 1;\n        iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n    } else {\n        makeIntermediateArr(ruleObj);\n    }\n}\nfunction makeIntermediateArr (inputObj) {\n    let intermediateArr = [];\n    for (var rule in inputObj) {\n        if (inputObj[rule].public) {\n            if (!Array.isArray(inputObj[rule].rule[0])) {\n                intermediateArr.push([inputObj[rule].rule]);\n            } else {\n                intermediateArr.push(inputObj[rule].rule);\n            }\n        }\n    }\n    intermediateArr = intermediateArr.flat(1);\n    node.send({payload:intermediateArr});\n}\niterateRules(ruleObj[ruleKeys[currentRule]].rule);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":430,"y":120,"wires":[["e49fdb1e.c06768"]]},{"id":"4d6f9aa6.a03db4","type":"function","z":"a71fc6a3.13964","name":"unTokenizerPlusWeights","func":"let outputArr = [];\nlet combine = \"\";\nlet tags = \"\";\nconst addTags = env.get(\"tags\");\nconst weights = 
env.get(\"weights\");\nmsg.payload.forEach(sentence => {\n    if (Array.isArray(sentence)) {\n        sentence.forEach(token => {\n            if (token !== \"(\" && token !== \")\") {\n                if (token.match(/\\{(.*)\\}/g) !== null) {\n                    switch (addTags) {\n                        case \"remove\":\n                            break;\n                        case \"leave\":\n                            combine = combine + token + \" \";\n                            break;\n                        case \"left\":\n                            tags = tags + token + \";\";\n                            break;\n                        case \"split\":\n                            break;\n                    }\n                } else {\n                    combine = combine + token + \" \";\n                }\n            }\n        });\n        if (tags !== \"\") {\n            tags = tags.slice(0,-1).replace(/[\\{\\}]/g,\"\");\n            combine = tags + \":\" + combine;\n        }\n        if (combine !== \"\") {\n            combine = combine.trim();\n            if (!outputArr.includes(combine)) { outputArr.push(combine); }\n        }\n        combine = \"\";\n        tags = \"\";\n    }\n});\nif (weights) {\n    let newOutputArr = [];\n    for (i=0;i<outputArr.length;i++) {\n        let factor = 1;\n        if (outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g) !== null) {\n            let factorRaw = outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g);\n            if (factorRaw.length === 1) {\n                factor = outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g)[0];\n                factor = Math.ceil(Number(factor.replace(/\\/*/g,\"\")));\n            } else {\n                factorRaw.forEach((part,index) => {\n                    factorRaw[index] = Math.ceil(Number(part.replace(/\\/*/g,\"\")));\n                });\n                for (let b=0;b<factorRaw.length;b++) {\n                    factor = factor * factorRaw[b];\n                }\n    
        }\n            \n            outputArr[i] = outputArr[i].replace(/\\/[0-9]?[\\.0-9]*\\/\\s/g, \"\");\n            for(let a=0;a<factor;a++) {\n                newOutputArr.push(outputArr[i]);\n            }\n        } else {\n            newOutputArr.push(outputArr[i]);\n        }\n    }\n    outputArr = newOutputArr;\n}\nif (env.get(\"outputType\") === \"string\") {\n    let outputString = \"\";\n    outputArr.forEach(sentence => {\n        outputString = outputString + sentence + \"\\n\";\n    });\n    msg.payload = outputString;\n} else {\n    msg.payload = outputArr;\n}\nconst ruleName = flow.get(\"ruleName\");\nif (ruleName) {\n    msg.topic = ruleName;\n}\nreturn msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":1030,"y":120,"wires":[["be781da.d15a96"]]},{"id":"e49fdb1e.c06768","type":"function","z":"a71fc6a3.13964","name":"ruleAlternatives","func":"let iterateAlternativesRule = 0;\nlet toDo = [];\nlet output = [];\nlet iterations = 998;\nfunction makeAlternatives (inputTokens) {\n    if (alternativesChecker(inputTokens)) {\n        let firstSeparator = 0;\n        let start = 0;\n        let end = inputTokens.length;\n        let levelParentheses = 1;\n        for (i=0;i<inputTokens.length;i++) {\n            if (inputTokens[i] === \"|\") {\n                firstSeparator = i;\n                break;\n            }\n        }\n        for (i=firstSeparator-1;i>=0;i--) {\n            if (inputTokens[i] === \")\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \"(\") {\n                levelParentheses -= 1;\n            }\n            if (levelParentheses === 0) {\n                start = i;\n                break;\n            } \n        }\n        levelParentheses = 1;\n        for (i=firstSeparator+1;i<inputTokens.length;i++) {\n            if (inputTokens[i] === \"(\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \")\") {\n                
levelParentheses -= 1;\n            }\n            if (levelParentheses === 0) {\n                end = i;\n                break;\n            }\n        }\n        levelParentheses = 0;\n        let previousSeparator = start;\n        let alternativesArr = [];\n        for (i=start;i<=end;i++) {\n            if (inputTokens[i] === \"(\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \")\") {\n                levelParentheses -= 1;\n            }\n            if ((inputTokens[i] === \"|\" && levelParentheses === 1) || levelParentheses === 0) {\n                let thisAlternative = inputTokens.slice(previousSeparator + 1, i);\n                alternativesArr.push(thisAlternative);\n                previousSeparator = i;\n            }\n        }\n        let alternativeBlockLength = end - start + 1;\n        let outputAlternatives = [];\n        for (i=0;i<alternativesArr.length;i++) {\n            let intermediate = [...inputTokens];\n            intermediate.splice(start, alternativeBlockLength,alternativesArr[i]);\n            intermediate = intermediate.flat();\n            outputAlternatives.push(intermediate);\n        }\n        outputAlternatives.forEach(alternative => {\n            if (alternativesChecker(alternative)) {\n                if (!arrayChecker(alternative,toDo)) {\n                    toDo.push(alternative);\n                }\n            } else {\n                output.push(alternative);\n            }\n        });\n        if (toDo.length === 0) {\n            if (iterateAlternativesRule < alternativesIterationLength - 1) {\n                iterateAlternativesRule += 1;\n                toDo.push(ruleArr[iterateAlternativesRule]);\n                nextOne();\n            } else {\n                node.send({payload:output});\n            }\n        } else {\n            nextOne();\n        }\n    } else {\n        output.push(inputTokens);\n        if (iterateAlternativesRule < 
alternativesIterationLength - 1) {\n            iterateAlternativesRule += 1;\n            toDo.push(ruleArr[iterateAlternativesRule]);\n            nextOne();\n        } else {\n            node.send({payload:output});\n        }\n    }\n    return;\n}\nfunction alternativesChecker (inputArr) {\n    if (inputArr.includes(\"|\")) {\n        return true;\n    }\n    return false;\n}\nfunction arrayChecker (arr1, arr2) {\n    let check = false;\n    for (i=0;i<arr2.length;i++) {\n        if (arr1.length === arr2[i].length && arr1.every((item,index) => {return item === arr2[i][index];})) {\n            check = true;\n            break;\n        }\n    }\n    return check;\n}\nfunction nextOne () {\n    iterations += 1;\n    let next = toDo.shift();\n    if (Array.isArray(next)) {\n        if (iterations >= 1000) {\n            iterations = 0;\n            setTimeout(()=>{\n                makeAlternatives(next);\n            },0);\n        } else {\n            makeAlternatives(next);\n        }\n    }\n    return;\n}\nlet ruleArr = msg.payload\nlet alternativesIterationLength = ruleArr.length;\nmakeAlternatives(ruleArr[iterateAlternativesRule]);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":620,"y":120,"wires":[["77f7190d.b0eef"]]},{"id":"a981f80e.432d8","type":"change","z":"a71fc6a3.13964","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"processing...","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":260,"y":180,"wires":[[]]},{"id":"be781da.d15a96","type":"change","z":"a71fc6a3.13964","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":1000,"y":180,"wires":[[]]},{"id":"9d1669a5.171c18","type":"function","z":"a71fc6a3.13964","name":"unTokenizer","func":"let outputArr = [];\nlet combine = \"\";\nlet tags = \"\";\nconst addTags = env.get(\"tags\");\nmsg.payload.forEach(sentence => {\n    if (Array.isArray(sentence)) {\n        
sentence.forEach(token => {\n            if (token !== \"(\" && token !== \")\") {\n                if (token.match(/\\{(.*)\\}/g) !== null) {\n                    switch (addTags) {\n                        case \"remove\":\n                            break;\n                        case \"leave\":\n                            combine = combine + token + \" \";\n                            break;\n                        case \"left\":\n                            tags = tags + token + \";\";\n                            break;\n                        case \"split\":\n                            tags = tags + token + \";\";\n                            break;\n                    }\n                } else {\n                    combine = combine + token + \" \";\n                }\n            }\n        });\n        if (tags !== \"\") {\n            tags = tags.slice(0,-1).replace(/[\\{\\}]/g,\"\");\n            combine = tags + \":\" + combine;\n        }\n        if (combine !== \"\") {\n            combine = combine.trim();\n            if (!outputArr.includes(combine)) { outputArr.push(combine); }\n        }\n        combine = \"\";\n        tags = \"\";\n    }\n});\nif (env.get(\"outputType\") === \"string\") {\n    let outputString = \"\";\n    outputArr.forEach(sentence => {\n        outputString = outputString + sentence + \"\\n\";\n    });\n    msg.payload = outputString;\n} else {\n    msg.payload = outputArr;\n}\nconst ruleName = flow.get(\"ruleName\");\nif (ruleName) {\n    msg.topic = ruleName;\n}\nreturn 
msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":1130,"y":60,"wires":[[]]},{"id":"cdc5300f.0306c","type":"switch","z":"a71fc6a3.13964","name":"","property":"tags","propertyType":"env","rules":[{"t":"eq","v":"split","vt":"str"}],"checkall":"true","repair":false,"outputs":1,"x":970,"y":60,"wires":[["9d1669a5.171c18"]]},{"id":"2fb3f381.45d33c","type":"subflow:a71fc6a3.13964","z":"915ebe2f.a456a8","name":"","env":[{"name":"outputType","value":"string","type":"str"},{"name":"weights","type":"bool","value":"true"}],"x":580,"y":60,"wires":[["e1d96124.681288","d53d59f5.82edf"],[]]}]

Flow Info

Created 4 years, 7 months ago

Updated 4 years, 2 months ago

Rating: not yet rated

view on github

Owner

johanneskropf

Actions

share flow

Node Types

Core

change (x2)
function (x6)
switch (x1)

Other

subflow (x1)
subflow:a71fc6a3.13964 (x1)