simple jsgf permutator subflow

About

This subflow can parse all possible sentences / permutations from a JSGF grammar file and output them as an array of sentences. This can, for example, be useful to quickly generate a text corpus to train a language model for a speech recognition system like DeepSpeech or Kaldi. It can also be used to rig up a simple NLU / intent recognition system together with node-red-contrib-fuzzywuzzy. This is done by adding tags to the JSGF grammar and then choosing "move to left" for the Tags option in the subflow menu. This subflow only uses function nodes and has zero dependencies.

Usage

To create all possible sentences, pass in the JSGF file content as a string in msg.payload. The subflow will then output all permutations of the public rule sentences described by the grammar as an array of sentences in msg.payload. It can also be configured to output the sentences as a single string with one sentence per line. This result is sent to the first output of the subflow.

Tags

If your grammar includes tags, you can choose whether to remove them from the final sentences (if you want a clean corpus to train a language model), leave them in place, or move them to the left with a separator (:), which can be useful for intent recognition with node-red-contrib-fuzzywuzzy. You can also choose the split option to send a clean version to output 1 for language model training and a version with the tags moved to the left to output 2 for fuzzy intent recognition in one go.

Weights

The subflow supports a simplified version of weights. This is opt-in: by default weights are ignored, so if you want them to be applied, select the "use weights" check box in the menu. Unlike the full JSGF standard, this subflow, due to its deterministic nature, only supports whole numbers as weights. A possible sentence will appear once in the result if it has either no weight or a weight of 1. A weight bigger than 1 works as a multiplication factor, so an alternative with a weight of 5, for example, will appear 5 times in the result. This can be used to fine-tune a corpus and its likelihoods for a language model, for example.

Restrictions

Right now the subflow only supports a subset of the jsgf syntax. It supports:

  • optionals
  • alternatives and whole number weights
  • public & private rules
  • single line comments
  • tags
  • multiline rules

It doesn't support:

  • recursive rules
  • weights that are not whole numbers
  • imports
  • Kleene Star(*) or Plus(+) operators
  • multi line comments

There may be other features missing as this is still work in progress.

Things to look out for

  • always wrap alternatives in parentheses
  • surround tags with spaces towards surrounding elements
  • end all rules with a semicolon (;) for correct parsing
  • don't include infinite loops (recursions) in your grammar
  • be aware that every unique sentence will only ever be output once even if it appears multiple times in the grammar
  • processing of large and/or complex grammars can take anywhere from seconds to minutes
[{"id":"a71fc6a3.13964","type":"subflow","name":"jsgf permutator","info":"","category":"","in":[{"x":60,"y":120,"wires":[{"id":"64935c66.d7f52c"},{"id":"a981f80e.432d8"}]}],"out":[{"x":1200,"y":120,"wires":[{"id":"4d6f9aa6.a03db4","port":0}]},{"x":1260,"y":60,"wires":[{"id":"9d1669a5.171c18","port":0}]}],"env":[{"name":"tags","type":"str","value":"remove","ui":{"icon":"font-awesome/fa-bookmark-o","label":{"en-US":"Tags"},"type":"select","opts":{"opts":[{"l":{"en-US":"remove"},"v":"remove"},{"l":{"en-US":"leave"},"v":"leave"},{"l":{"en-US":"move to left"},"v":"left"},{"l":{"en-US":"split: remove/left"},"v":"split"}]}}},{"name":"outputType","type":"str","value":"array","ui":{"label":{"en-US":"output as"},"type":"select","opts":{"opts":[{"l":{"en-US":"Array"},"v":"array"},{"l":{"en-US":"String"},"v":"string"}]}}},{"name":"weights","type":"bool","value":"false","ui":{"label":{"en-US":"use weights"},"type":"checkbox"}}],"color":"#E7E7AE","icon":"node-red/split.svg","status":{"x":1160,"y":220,"wires":[{"id":"a981f80e.432d8","port":0},{"id":"be781da.d15a96","port":0}]}},{"id":"64935c66.d7f52c","type":"function","z":"a71fc6a3.13964","name":"ruleTokenizer","func":"const weights = env.get(\"weights\");\nfunction ruleTokenizer (rule) {\n    let ruleArrRaw = rule.split(\"\");\n    let ruleArr = [];\n    let outputToken = \"\";\n    ruleArrRaw.forEach(token => {\n        if (token.match(/[a-zA-Z0-9\\ä\\ö\\ü\\ß\\-\\_\\{\\}\\<\\>\\/]/) !== null) {\n            outputToken += token;\n        } else if (token.match(/[\\[\\]\\|\\(\\)]/) !== null) {\n            if (outputToken.length !== 0) {\n                ruleArr.push(outputToken);\n                outputToken = \"\";\n            }\n            ruleArr.push(token);\n        } else if (token.match(/\\s/) !== null) {\n            if (outputToken.length !== 0) {\n                ruleArr.push(outputToken);\n                outputToken = \"\";\n            }\n        }\n    });\n    if (outputToken.length !== 0) {\n        
ruleArr.push(outputToken);\n    }\n    return ruleArr;\n}\nlet rawRules = msg.payload.replace(/\\/\\/(.*)\\r?\\n|\\r/g, \" \").replace(/\\r?\\n|\\r/g, \" \").replace(/\\s{2,}/g, \" \");\nif (!weights) {\n    rawRules = rawRules.replace(/\\/[0-9]?[\\.0-9]*\\//g, \"\");\n}\nrawRules = rawRules.split(\";\").map(rule => rule.trim()).filter(rule => rule.length > 0);\nlet ruleName = rawRules.filter(rule => rule.match(/^grammar\\s/g))[0].replace(/^grammar\\s/g, \"\");\n(ruleName) ? flow.set(\"ruleName\", ruleName) : flow.set(\"ruleName\", false);\nrawRules = rawRules.filter(rule => rule.match(/\\=/g));\nlet rulesObj = {};\nrawRules.forEach(rule => {\n    rule = rule.split(\"=\").map(rule => rule.trim());\n    let isPublic =(rule[0].match(/^public/g) !== null) ? true : false;\n    let name = rule[0].match(/\\<(.+)\\>/g).toString().replace(/\\<|\\>/g, \"\");\n    rulesObj[name] = {\n        rule: ruleTokenizer(rule[1]),\n        public: isPublic\n    }\n});\nmsg.payload = rulesObj;\nreturn msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":250,"y":120,"wires":[["6ba0d6a0.b6ada"]]},{"id":"77f7190d.b0eef","type":"function","z":"a71fc6a3.13964","name":"ruleOptionals","func":"let finalResults = [];\nlet iterateOptionalsRule = 0;\nlet toDo = [];\nlet iterations = 998;\nfunction optionalsCreator (tokens) {\n    if (optionalsChecker(tokens)) {\n        let check = tokens.indexOf(\"[\");\n        let level = 1;\n        let withIt = [];\n        let withoutIt = [];\n        let results = [];\n        for (i = check+1; i < tokens.length; i++) {\n            if (tokens[i] === \"]\") {\n                level -= 1;\n            } else if (tokens[i] === \"[\") {\n                level += 1;\n            }\n            if (level === 0) {\n                withIt = tokens.filter((token,index) => index !== check && index !== i);\n                withoutIt = tokens.filter((token,index) => index < check || index > i);\n                break;\n            }\n        }\n        
results.push(withIt, withoutIt);\n        results.forEach(result => {\n            if (optionalsChecker(result)) {\n                toDo.push(result);\n            } else {\n                finalResults.push(result);\n            }\n        });\n        if (toDo.length === 0) {\n            if (iterateOptionalsRule < optionalsIterationLength - 1) {\n                iterateOptionalsRule += 1;\n                toDo.push(ruleArr[iterateOptionalsRule]);\n                nextOne();\n            } else {\n                node.send({payload:finalResults});\n            }\n        } else {\n            nextOne();\n        }\n    } else {\n        finalResults.push(tokens);\n        if (iterateOptionalsRule < optionalsIterationLength - 1) {\n            iterateOptionalsRule += 1;\n            toDo.push(ruleArr[iterateOptionalsRule]);\n            nextOne();\n        } else {\n            node.send({payload:finalResults});\n        }\n    }\n    return;\n}\nfunction optionalsChecker (inputArray) {\n    if (inputArray.includes(\"[\") && inputArray.includes(\"]\")) {\n        return true;\n    }\n    return false;\n}\nfunction nextOne () {\n    iterations += 1;\n    let next = toDo.shift();\n    if (Array.isArray(next)) {\n        if (iterations >= 1000) {\n            iterations = 0;\n            setTimeout(()=>{\n                optionalsCreator(next);\n            },0);\n        } else {\n            optionalsCreator(next);\n        }\n    }\n    return;\n}\nlet ruleArr = msg.payload;\nlet optionalsIterationLength = ruleArr.length;\noptionalsCreator(ruleArr[iterateOptionalsRule]);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":810,"y":120,"wires":[["4d6f9aa6.a03db4","cdc5300f.0306c"]]},{"id":"6ba0d6a0.b6ada","type":"function","z":"a71fc6a3.13964","name":"ruleExpander","func":"let currentRule = 0;\nlet ruleObj = msg.payload;\nconst ruleKeys = Object.keys(ruleObj);\nfunction iterateRules (rule) {\n    if (ruleObj[ruleKeys[currentRule]].public) {\n        
rule.unshift(\"(\");\n        rule.push(\")\");\n        for (i=0;i<rule.length;i++) {\n            if (rule[i].match(/\\<(.*)\\>/) !== null) {\n               let sub = rule[i].replace(\"<\",\"\").replace(\">\",\"\");\n               rule[i] = ruleObj[sub].rule;\n               rule[i].unshift(\"(\");\n               rule[i].push(\")\");\n            }\n        }\n        rule = rule.flat();\n        let test = false;\n        rule.forEach(token => {\n            if (token.match(/\\<(.*)\\>/) !== null) {\n                test = true;\n            }\n        });\n        ruleObj[ruleKeys[currentRule]].rule = rule;\n        if (test) {\n            iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n        } else if (currentRule < ruleKeys.length - 1) {\n            currentRule += 1;\n            iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n        } else {\n            makeIntermediateArr(ruleObj);\n        }\n    } else if (currentRule < ruleKeys.length - 1) {\n        currentRule += 1;\n        iterateRules(ruleObj[ruleKeys[currentRule]].rule);\n    } else {\n        makeIntermediateArr(ruleObj);\n    }\n}\nfunction makeIntermediateArr (inputObj) {\n    let intermediateArr = [];\n    for (var rule in inputObj) {\n        if (inputObj[rule].public) {\n            if (!Array.isArray(inputObj[rule].rule[0])) {\n                intermediateArr.push([inputObj[rule].rule]);\n            } else {\n                intermediateArr.push(inputObj[rule].rule);\n            }\n        }\n    }\n    intermediateArr = intermediateArr.flat(1);\n    node.send({payload:intermediateArr});\n}\niterateRules(ruleObj[ruleKeys[currentRule]].rule);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":430,"y":120,"wires":[["e49fdb1e.c06768"]]},{"id":"4d6f9aa6.a03db4","type":"function","z":"a71fc6a3.13964","name":"unTokenizerPlusWeights","func":"let outputArr = [];\nlet combine = \"\";\nlet tags = \"\";\nconst addTags = env.get(\"tags\");\nconst weights = 
env.get(\"weights\");\nmsg.payload.forEach(sentence => {\n    if (Array.isArray(sentence)) {\n        sentence.forEach(token => {\n            if (token !== \"(\" && token !== \")\") {\n                if (token.match(/\\{(.*)\\}/g) !== null) {\n                    switch (addTags) {\n                        case \"remove\":\n                            break;\n                        case \"leave\":\n                            combine = combine + token + \" \";\n                            break;\n                        case \"left\":\n                            tags = tags + token + \";\";\n                            break;\n                        case \"split\":\n                            break;\n                    }\n                } else {\n                    combine = combine + token + \" \";\n                }\n            }\n        });\n        if (tags !== \"\") {\n            tags = tags.slice(0,-1).replace(/[\\{\\}]/g,\"\");\n            combine = tags + \":\" + combine;\n        }\n        if (combine !== \"\") {\n            combine = combine.trim();\n            if (!outputArr.includes(combine)) { outputArr.push(combine); }\n        }\n        combine = \"\";\n        tags = \"\";\n    }\n});\nif (weights) {\n    let newOutputArr = [];\n    for (i=0;i<outputArr.length;i++) {\n        let factor = 1;\n        if (outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g) !== null) {\n            let factorRaw = outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g);\n            if (factorRaw.length === 1) {\n                factor = outputArr[i].match(/\\/[0-9]?[\\.0-9]*\\//g)[0];\n                factor = Math.ceil(Number(factor.replace(/\\/*/g,\"\")));\n            } else {\n                factorRaw.forEach((part,index) => {\n                    factorRaw[index] = Math.ceil(Number(part.replace(/\\/*/g,\"\")));\n                });\n                for (let b=0;b<factorRaw.length;b++) {\n                    factor = factor * factorRaw[b];\n                }\n    
        }\n            \n            outputArr[i] = outputArr[i].replace(/\\/[0-9]?[\\.0-9]*\\/\\s/g, \"\");\n            for(let a=0;a<factor;a++) {\n                newOutputArr.push(outputArr[i]);\n            }\n        } else {\n            newOutputArr.push(outputArr[i]);\n        }\n    }\n    outputArr = newOutputArr;\n}\nif (env.get(\"outputType\") === \"string\") {\n    let outputString = \"\";\n    outputArr.forEach(sentence => {\n        outputString = outputString + sentence + \"\\n\";\n    });\n    msg.payload = outputString;\n} else {\n    msg.payload = outputArr;\n}\nconst ruleName = flow.get(\"ruleName\");\nif (ruleName) {\n    msg.topic = ruleName;\n}\nreturn msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":1030,"y":120,"wires":[["be781da.d15a96"]]},{"id":"e49fdb1e.c06768","type":"function","z":"a71fc6a3.13964","name":"ruleAlternatives","func":"let iterateAlternativesRule = 0;\nlet toDo = [];\nlet output = [];\nlet iterations = 998;\nfunction makeAlternatives (inputTokens) {\n    if (alternativesChecker(inputTokens)) {\n        let firstSeparator = 0;\n        let start = 0;\n        let end = inputTokens.length;\n        let levelParentheses = 1;\n        for (i=0;i<inputTokens.length;i++) {\n            if (inputTokens[i] === \"|\") {\n                firstSeparator = i;\n                break;\n            }\n        }\n        for (i=firstSeparator-1;i>=0;i--) {\n            if (inputTokens[i] === \")\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \"(\") {\n                levelParentheses -= 1;\n            }\n            if (levelParentheses === 0) {\n                start = i;\n                break;\n            } \n        }\n        levelParentheses = 1;\n        for (i=firstSeparator+1;i<inputTokens.length;i++) {\n            if (inputTokens[i] === \"(\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \")\") {\n                
levelParentheses -= 1;\n            }\n            if (levelParentheses === 0) {\n                end = i;\n                break;\n            }\n        }\n        levelParentheses = 0;\n        let previousSeparator = start;\n        let alternativesArr = [];\n        for (i=start;i<=end;i++) {\n            if (inputTokens[i] === \"(\") {\n                levelParentheses += 1;\n            }\n            if (inputTokens[i] === \")\") {\n                levelParentheses -= 1;\n            }\n            if ((inputTokens[i] === \"|\" && levelParentheses === 1) || levelParentheses === 0) {\n                let thisAlternative = inputTokens.slice(previousSeparator + 1, i);\n                alternativesArr.push(thisAlternative);\n                previousSeparator = i;\n            }\n        }\n        let alternativeBlockLength = end - start + 1;\n        let outputAlternatives = [];\n        for (i=0;i<alternativesArr.length;i++) {\n            let intermediate = [...inputTokens];\n            intermediate.splice(start, alternativeBlockLength,alternativesArr[i]);\n            intermediate = intermediate.flat();\n            outputAlternatives.push(intermediate);\n        }\n        outputAlternatives.forEach(alternative => {\n            if (alternativesChecker(alternative)) {\n                if (!arrayChecker(alternative,toDo)) {\n                    toDo.push(alternative);\n                }\n            } else {\n                output.push(alternative);\n            }\n        });\n        if (toDo.length === 0) {\n            if (iterateAlternativesRule < alternativesIterationLength - 1) {\n                iterateAlternativesRule += 1;\n                toDo.push(ruleArr[iterateAlternativesRule]);\n                nextOne();\n            } else {\n                node.send({payload:output});\n            }\n        } else {\n            nextOne();\n        }\n    } else {\n        output.push(inputTokens);\n        if (iterateAlternativesRule < 
alternativesIterationLength - 1) {\n            iterateAlternativesRule += 1;\n            toDo.push(ruleArr[iterateAlternativesRule]);\n            nextOne();\n        } else {\n            node.send({payload:output});\n        }\n    }\n    return;\n}\nfunction alternativesChecker (inputArr) {\n    if (inputArr.includes(\"|\")) {\n        return true;\n    }\n    return false;\n}\nfunction arrayChecker (arr1, arr2) {\n    let check = false;\n    for (i=0;i<arr2.length;i++) {\n        if (arr1.length === arr2[i].length && arr1.every((item,index) => {return item === arr2[i][index];})) {\n            check = true;\n            break;\n        }\n    }\n    return check;\n}\nfunction nextOne () {\n    iterations += 1;\n    let next = toDo.shift();\n    if (Array.isArray(next)) {\n        if (iterations >= 1000) {\n            iterations = 0;\n            setTimeout(()=>{\n                makeAlternatives(next);\n            },0);\n        } else {\n            makeAlternatives(next);\n        }\n    }\n    return;\n}\nlet ruleArr = msg.payload\nlet alternativesIterationLength = ruleArr.length;\nmakeAlternatives(ruleArr[iterateAlternativesRule]);\nreturn;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":620,"y":120,"wires":[["77f7190d.b0eef"]]},{"id":"a981f80e.432d8","type":"change","z":"a71fc6a3.13964","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"processing...","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":260,"y":180,"wires":[[]]},{"id":"be781da.d15a96","type":"change","z":"a71fc6a3.13964","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":1000,"y":180,"wires":[[]]},{"id":"9d1669a5.171c18","type":"function","z":"a71fc6a3.13964","name":"unTokenizer","func":"let outputArr = [];\nlet combine = \"\";\nlet tags = \"\";\nconst addTags = env.get(\"tags\");\nmsg.payload.forEach(sentence => {\n    if (Array.isArray(sentence)) {\n        
sentence.forEach(token => {\n            if (token !== \"(\" && token !== \")\") {\n                if (token.match(/\\{(.*)\\}/g) !== null) {\n                    switch (addTags) {\n                        case \"remove\":\n                            break;\n                        case \"leave\":\n                            combine = combine + token + \" \";\n                            break;\n                        case \"left\":\n                            tags = tags + token + \";\";\n                            break;\n                        case \"split\":\n                            tags = tags + token + \";\";\n                            break;\n                    }\n                } else {\n                    combine = combine + token + \" \";\n                }\n            }\n        });\n        if (tags !== \"\") {\n            tags = tags.slice(0,-1).replace(/[\\{\\}]/g,\"\");\n            combine = tags + \":\" + combine;\n        }\n        if (combine !== \"\") {\n            combine = combine.trim();\n            if (!outputArr.includes(combine)) { outputArr.push(combine); }\n        }\n        combine = \"\";\n        tags = \"\";\n    }\n});\nif (env.get(\"outputType\") === \"string\") {\n    let outputString = \"\";\n    outputArr.forEach(sentence => {\n        outputString = outputString + sentence + \"\\n\";\n    });\n    msg.payload = outputString;\n} else {\n    msg.payload = outputArr;\n}\nconst ruleName = flow.get(\"ruleName\");\nif (ruleName) {\n    msg.topic = ruleName;\n}\nreturn 
msg;","outputs":1,"noerr":0,"initialize":"","finalize":"","x":1130,"y":60,"wires":[[]]},{"id":"cdc5300f.0306c","type":"switch","z":"a71fc6a3.13964","name":"","property":"tags","propertyType":"env","rules":[{"t":"eq","v":"split","vt":"str"}],"checkall":"true","repair":false,"outputs":1,"x":970,"y":60,"wires":[["9d1669a5.171c18"]]},{"id":"2fb3f381.45d33c","type":"subflow:a71fc6a3.13964","z":"915ebe2f.a456a8","name":"","env":[{"name":"outputType","value":"string","type":"str"},{"name":"weights","type":"bool","value":"true"}],"x":580,"y":60,"wires":[["e1d96124.681288","d53d59f5.82edf"],[]]}]

Flow Info

Created 3 years, 2 months ago
Updated 2 years, 9 months ago
Rating: not yet rated

Actions

Rate:

Node Types

Core
  • change (x2)
  • function (x6)
  • switch (x1)
Other
  • subflow (x1)
  • subflow:a71fc6a3.13964 (x1)

Tags

  • stt
  • jsgf
  • speech-recognition
  • asr
  • nlu
  • corpus
  • language-model
Copy this flow JSON to your clipboard and then import into Node-RED using the Import From > Clipboard (Ctrl-I) menu option