Automating Web Scraping, LLM Data Transformation using LLaMA 3.1 on groq.com
Overview:
Web Scraping:
- The flow begins with a scheduled trigger every Saturday at 7:00 AM. It scrapes a webpage and extracts the body text.
Data Cleaning and Transformation:
- The scraped HTML content is cleaned and transformed into JSON objects. This is done using a LLM (LLaMA 3.1 via Groq), which formats the raw data into structured JSON.
Distance Calculation:
- The JSON objects are looped through, and for each object, the address is set, and parameters are configured for the Google Distance Matrix API.
- The API calculates the distance for each object, and only those objects that meet specific criteria are retained.
HTML Email Generation:
- Once the relevant objects are filtered, the flow generates an HTML email content using LLaMA 3.1 again and sends the email via a configured SMTP node.
Error Handling:
- The flow includes error handling logic to catch issues related to the Google API and notify via a connected Home Assistant instance if something goes wrong.
Key Components:
- LLaMA 3.1: Used twice in the flow, once for transforming the scraped data into JSON, and again for generating the HTML email content.
- Google Distance Matrix API: Responsible for calculating distances, helping filter objects based on geographic relevance.
- Node-RED: Orchestrating the entire process, from scraping to email dispatch, with clean error handling.
This flow demonstrates how Node-RED can be leveraged to create sophisticated workflows that combine external APIs, advanced AI models, and robust automation features.
[{"id":"8899d0af1bbdfc27","type":"tab","label":"Permis construire alerte","disabled":false,"info":""},{"id":"d0d50336b4cb2c50","type":"inject","z":"8899d0af1bbdfc27","name":"Samedi 7h00","props":[{"p":"payload"},{"p":"topic","vt":"str"}],"repeat":"","crontab":"00 07 * * 6","once":false,"onceDelay":0.1,"topic":"","payload":"","payloadType":"date","x":140,"y":100,"wires":[["e894160976c800e6"]]},{"id":"e894160976c800e6","type":"http request","z":"8899d0af1bbdfc27","name":"Scrap web page","method":"GET","ret":"txt","paytoqs":"ignore","url":"https://www.neuchatelville.ch/enquetes-publiques","tls":"","persist":false,"proxy":"","insecureHTTPParser":false,"authType":"","senderr":false,"headers":[],"x":380,"y":120,"wires":[["aa223496fbecbe26"]]},{"id":"aa223496fbecbe26","type":"cheerio-function","z":"8899d0af1bbdfc27","name":"ce-bodytext","func":"//var nextAll = $('div.ce-bodytext').eq(1).nextAll();\n//var bodytext = ${nextAll.text()};\n\n// @ts-ignore\nmsg.payload = $('div.ce-bodytext').eq(1);\n\n\n//*[@id=\"c14948\"]/div/div\nreturn msg;","outputs":1,"noerr":0,"x":570,"y":120,"wires":[["ba397d4b29263c74"]]},{"id":"a77124e989d217a6","type":"OpenAI API","z":"8899d0af1bbdfc27","name":"Groq","property":"payload","propertyType":"msg","service":"716812fe2033d25c","method":"createChatCompletion","x":650,"y":260,"wires":[["9c009603ee4a68bb","bb23e3241e1a1ff5"]]},{"id":"9c009603ee4a68bb","type":"debug","z":"8899d0af1bbdfc27","name":"msg content","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload.choices[0].message.content","targetType":"msg","statusVal":"","statusType":"auto","x":870,"y":260,"wires":[]},{"id":"cb6783146b04907c","type":"change","z":"8899d0af1bbdfc27","name":"prompt & model setting","rules":[{"t":"set","p":"payload","pt":"msg","to":"{\t \"messages\":[\t {\t \"role\":\"system\",\t \"content\":\"You generate output in JSON format only, with no further explanatory text. Generate a JSON with this structure : {\\\"enquetes\\\":[{\\\"dossier_number\\\":\\\"116183\\\",\\\"city\\\":\\\"Corcelles - Cormondrèche\\\",\\\"situation\\\":\\\"Rue de la Chapelle 14, Corcelles - Cormondrèche\\\",\\\"description\\\":\\\"Transformation du 1er étage et création d'un studio au rez-de-chaussée\\\",\\\"requerant\\\":\\\"Pierre-Yves Besancet\\\",\\\"auteur\\\":\\\"André Erard\\\",\\\"plans\\\":\\\"MONNIER-ERARD SA ARCHITECTES SIA DIPLOMES EPFL, 2000 NEUCHATEL\\\",\\\"distance\\\":\\\"\\\"}]}. I want an \\\"city\\\" element, whiche takes infos from \\\"Situation\\\" and from \\\"Parcelle(s) et coordonnées\\\", but only after the number and -. Exemple : from the text : Parcelle(s) et coordonnées: 4412 - Corcelles - Cormondrèche ; 4413/A - Corcelles - Cormondrèche ; I want only : Corcelles - Cormondrèche. Or another exemple: from the text : Parcelle(s) et coordonnées: 15604,14206 - Neuchâtel ; I want only : Neuchâtel. Add an empty filed \\\"distance\\\". Remove all object that are not from cities : Corcelles - Cormondrèche, or Neuchâtel. Keep only object from these two cities. \"\t },\t {\t \"role\":\"user\",\t \"content\":\"Generate an output JSON base on the following HTML : \\n***\" & payload &\"***\"\t }\t ],\t \"model\":\"llama-3.1-70b-versatile\",\t \"stream\":false\t}","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":410,"y":260,"wires":[["a77124e989d217a6"]]},{"id":"ba397d4b29263c74","type":"function","z":"8899d0af1bbdfc27","name":"clean html","func":"\n// transform as a string\nmsg.payload= msg.payload.toString('utf-8')\n\nmsg.payload = msg.payload.replace('<div class=\"ce-bodytext\">', '<div>');\n\n//.replace(' class=ce-bodytext', '');\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":750,"y":120,"wires":[["cb6783146b04907c"]]},{"id":"ba5ae90d07203061","type":"json","z":"8899d0af1bbdfc27","name":"","property":"payload","action":"","pretty":true,"x":650,"y":320,"wires":[["1a4f8e22de6670be","25067bfa6ea24dd4"]]},{"id":"87488059aaf01c86","type":"comment","z":"8899d0af1bbdfc27","name":"Transofrm in a JSON objects","info":"","x":920,"y":200,"wires":[]},{"id":"1a4f8e22de6670be","type":"debug","z":"8899d0af1bbdfc27","name":"clean json","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":860,"y":320,"wires":[]},{"id":"25067bfa6ea24dd4","type":"loop","z":"8899d0af1bbdfc27","name":"","kind":"enum","count":"","initial":"1","step":"1","condition":"","conditionType":"js","when":"before","enumeration":"payload.enquetes","enumerationType":"msg","limit":"","loopPayload":"loop-index","finalPayload":"final-count","x":610,"y":480,"wires":[["719b2795a011cfea","b8b1603ad1052cd2"],["aa99facb435a05af"]]},{"id":"719b2795a011cfea","type":"debug","z":"8899d0af1bbdfc27","name":"end of Loop content","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":900,"y":480,"wires":[]},{"id":"bb23e3241e1a1ff5","type":"change","z":"8899d0af1bbdfc27","name":"Get msg content","rules":[{"t":"set","p":"payload","pt":"msg","to":"payload.choices[0].message.content","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":390,"y":320,"wires":[["ba5ae90d07203061"]]},{"id":"aa99facb435a05af","type":"function","z":"8899d0af1bbdfc27","name":"Set address","func":"// Check if the structure exists\nif (msg.loop && msg.loop.value && msg.loop.value.situation && msg.loop.value.city) {\n // Log the city value using node.warn\n node.warn(\"Address value: \" + msg.loop.value.situation + \", \" + msg.loop.value.city);\n var address = msg.loop.value.situation + \", \" + msg.loop.value.city;\n flow.set('address', address);\n node.warn(\"address new \" + context.get('address'));\n\n} else {\n // Log a warning if the city or sitation is not found\n node.warn(\"Adrress not found in the message\");\n}\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":650,"y":560,"wires":[["db70b4270c628744"]]},{"id":"db70b4270c628744","type":"function","z":"8899d0af1bbdfc27","name":"Set API parameters","func":"\n// distancematrix/json?key=<yourapiKey>&destinations=<your-destination>&origins=<your-origin>&units=metric\n\nvar key = \"you key\";\nvar units = \"metric\";\n// Exemple :\n//var destinations = \"Your Address\"; \nvar destinations = flow.get('address')||0;\n\n// print in console the destination\n//node.warn(\"New destination : \" + destinations);\nif(msg.loop.value.city.includes(\"Corcelles\")) {\n var origins = \"my origin\";\n} else {\n var origins = \"Other origin\";\n}\n\n\nmsg.payload = {\n origins: origins,\n destinations: destinations,\n key: key\n};\n\nreturn msg;\n\n","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":730,"y":600,"wires":[["04369f17d2013e5c"]]},{"id":"04369f17d2013e5c","type":"http request","z":"8899d0af1bbdfc27","name":"Get distance","method":"GET","ret":"obj","paytoqs":"query","url":"https://maps.googleapis.com/maps/api/distancematrix/json","tls":"","persist":false,"proxy":"","insecureHTTPParser":false,"authType":"","senderr":false,"headers":[],"x":790,"y":640,"wires":[["0ce3b2eda780fcf7","c455c0ef674c44fa"]]},{"id":"0ce3b2eda780fcf7","type":"debug","z":"8899d0af1bbdfc27","name":"Distance m","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload.rows[0].elements[0].distance.value","targetType":"msg","statusVal":"","statusType":"auto","x":1370,"y":640,"wires":[]},{"id":"c455c0ef674c44fa","type":"function","z":"8899d0af1bbdfc27","name":"keeps only object of interest add to objecstList","func":"// Initialize objectsList as an empty object if it does not exist\nvar objectsList = flow.get('objectsList') || {};\n\n// Extracting values from loop.value\nvar dossier_number = msg.loop.value.dossier_number;\nvar city = msg.loop.value.city;\nvar situation = msg.loop.value.situation;\nvar description = msg.loop.value.description;\nvar requerant = msg.loop.value.requerant;\nvar auteur = msg.loop.value.auteur;\nvar plans = msg.loop.value.plans || \"\"; // Valeur par défaut si non définie\nvar distance = msg.payload.rows[0].elements[0].distance.value; // Distance from message\n\n// Initialize object with extracted values\nvar newObject = {\n \"dossier_number\": dossier_number,\n \"city\": city,\n \"situation\": situation,\n \"description\": description,\n \"requerant\": requerant,\n \"auteur\": auteur,\n \"plans\": plans,\n \"distance\": \"\" // Distance initialized as an empty string\n};\n\n// Add distance only if less than 500\nif (distance < 500) {\n newObject.distance = distance.toString(); // Distance update\n}\n\n// Stores newObject in objectsList under a unique key\nobjectsList[dossier_number] = newObject; // Using dossier_number as a unique key\n\n// Saving updated objectsList in flow context\nflow.set('objectsList', objectsList);\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":940,"y":680,"wires":[["25067bfa6ea24dd4"]]},{"id":"b4505f38e96ec86b","type":"catch","z":"8899d0af1bbdfc27","name":"","scope":["04369f17d2013e5c","2d4cc3fb10088241"],"uncaught":false,"x":620,"y":1040,"wires":[["4efe6e067790ffbe","c759c58bdd1deac9"]]},{"id":"4efe6e067790ffbe","type":"debug","z":"8899d0af1bbdfc27","name":"debug google API","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":850,"y":1020,"wires":[]},{"id":"d5543019cd3673b3","type":"comment","z":"8899d0af1bbdfc27","name":"Loop to JSON objects","info":"","x":200,"y":480,"wires":[]},{"id":"2d4cc3fb10088241","type":"e-mail","z":"8899d0af1bbdfc27","server":"smtp.server.com","port":"465","authtype":"BASIC","saslformat":true,"token":"oauth2Response.access_token","secure":true,"tls":true,"name":"[email protected]","dname":"Mail server","x":1410,"y":860,"wires":[]},{"id":"44be0301445794e1","type":"change","z":"8899d0af1bbdfc27","name":"prompt & model setting","rules":[{"t":"set","p":"payload","pt":"msg","to":"{\t \"messages\":[\t {\t \"role\":\"system\",\t \"content\":\"Tu es un assistant de rédaction d’email en html. Tu recevras un JSON contenant 1 ou plusieurs objets d’enquêtes publiques provenant du site de la ville de Neuchâtel: https://www.neuchatelville.ch/enquetes-publiques. Rédige un email court, en informant des enquêtes publiques à proximité. Puis, liste de manière lisible chaque objet en texte sous forme de liste mise en page. Sépare chaque objet par un espace. Exemple de liste des objets : ***Voici la liste des enquêtes publiques à proximité de chez nous :\\n• <b>Dossier 116978</b> : Installation d'un refroidisseur en toiture. \\nSitué à : Neuchâtel, Rue de Vieux-Châtel 22. \\nRequérant : Antoine Savary RHNe, 2000 Neuchâtel. \\nAuteur : Antoine Savary RHNe, Maladière 45, 2000 NEUCHATEL.\\n\\n\\n*** Ce mail est envoyé aux membres de la famille. Ecris en français. Retourne uniquement le contenu (body) de l'email, n’écris pas d’introduction, ni objet. Commence par : Bonjour à tous, . Renseigne le site internet afin que nous puissions le consulter et que le lien soit actif. En fin de mail, retour à la ligne, tu signes 'le petit robot automatique' avec un emoji. Formate le contenu en html pour email en ajoutant les retours à la ligne et des paragraphe <p>, Utilisez <!DOCTYPE html> pour indiquer à votre programme que vous écrivez un code HTML.\"\t },\t {\t \"role\":\"user\",\t \"content\":\"L'objet JSON des enquêtes publiques : \\n***\" & payload &\"***\\n retourne moi que le contenu de l'email.\"\t }\t ],\t \"model\":\"llama-3.1-70b-versatile\",\t \"stream\":false\t}","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":790,"y":860,"wires":[["9282f0c90819e265"]]},{"id":"9282f0c90819e265","type":"OpenAI API","z":"8899d0af1bbdfc27","name":"Groq","property":"payload","propertyType":"msg","service":"716812fe2033d25c","method":"createChatCompletion","x":990,"y":860,"wires":[["c238e589b4380c4f"]]},{"id":"b8b1603ad1052cd2","type":"change","z":"8899d0af1bbdfc27","name":"Set topic & payload","rules":[{"t":"set","p":"payload","pt":"msg","to":"objectsList","tot":"flow"},{"t":"set","p":"topic","pt":"msg","to":"Alerte enquêtes publiques","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":550,"y":860,"wires":[["44be0301445794e1"]]},{"id":"3ed9c212b8b5a5fb","type":"inject","z":"8899d0af1bbdfc27","name":"","props":[{"p":"payload"},{"p":"topic","vt":"str"}],"repeat":"","crontab":"","once":false,"onceDelay":0.1,"topic":"","payload":"","payloadType":"date","x":250,"y":860,"wires":[["b8b1603ad1052cd2"]]},{"id":"c238e589b4380c4f","type":"change","z":"8899d0af1bbdfc27","name":"Get msg content","rules":[{"t":"set","p":"payload","pt":"msg","to":"payload.choices[0].message.content","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":1170,"y":860,"wires":[["64b7434b0f7f3c13","2d4cc3fb10088241"]]},{"id":"64b7434b0f7f3c13","type":"debug","z":"8899d0af1bbdfc27","name":"Message","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":1380,"y":800,"wires":[]},{"id":"d3e610275fee3e54","type":"comment","z":"8899d0af1bbdfc27","name":"Generate email content and send email","info":"","x":250,"y":780,"wires":[]},{"id":"33686a7aeeee4b4e","type":"comment","z":"8899d0af1bbdfc27","name":"In case of error... notify me on HA","info":"","x":230,"y":980,"wires":[]},{"id":"c759c58bdd1deac9","type":"api-call-service","z":"8899d0af1bbdfc27","name":"Notif error HA","server":"2f335157.a6fd8e","version":5,"debugenabled":false,"domain":"notify","service":"mobile_app_iphone","areaId":[],"deviceId":[],"entityId":[],"data":"{\"title\":\"Erreur API public inquiries\",\"message\":\"{{error.source.name}}, {{error.message}}\"}","dataType":"json","mergeContext":"","mustacheAltTags":false,"outputProperties":[],"queue":"none","x":840,"y":1060,"wires":[[]]},{"id":"696d2a38cd42e110","type":"comment","z":"8899d0af1bbdfc27","name":"Get Google Distance Matrix API","info":"","x":230,"y":640,"wires":[]},{"id":"635591882f1252e1","type":"comment","z":"8899d0af1bbdfc27","name":"End Loop","info":"","x":1060,"y":720,"wires":[]},{"id":"716812fe2033d25c","type":"Service Host","apiBase":"https://api.groq.com/openai/v1/","secureApiKeyHeaderOrQueryName":"Authorization","organizationId":"","name":"api.groq.com"},{"id":"2f335157.a6fd8e","type":"server","name":"Home Assistant","version":5,"addon":true,"rejectUnauthorizedCerts":true,"ha_boolean":"y|yes|true|on|home|open","connectionDelay":true,"cacheJson":true,"heartbeat":false,"heartbeatInterval":30,"areaSelector":"friendlyName","deviceSelector":"friendlyName","entitySelector":"friendlyName","statusSeparator":"at: ","statusYear":"hidden","statusMonth":"short","statusDay":"numeric","statusHourCycle":"h23","statusTimeFormat":"h:m","enableGlobalContextStore":true}]