Automating Web Scraping and LLM Data Transformation Using LLaMA 3.1 on groq.com

Overview:

  1. Web Scraping:

    • The flow begins with a scheduled trigger every Saturday at 7:00 AM. It scrapes a webpage and extracts the body text.
  2. Data Cleaning and Transformation:

    • The scraped HTML content is cleaned and transformed into JSON objects. This is done using an LLM (LLaMA 3.1 via Groq), which formats the raw data into structured JSON.
  3. Distance Calculation:

    • The JSON objects are looped through, and for each object, the address is set, and parameters are configured for the Google Distance Matrix API.
    • The API calculates the distance for each object, and only those objects that meet specific criteria are retained.
  4. HTML Email Generation:

    • Once the relevant objects are filtered, the flow generates the HTML email content, again using LLaMA 3.1, and sends the email via a configured SMTP node.
  5. Error Handling:

    • The flow includes error-handling logic that catches errors raised by the Google Distance Matrix API request or by the email node, and sends a notification through a connected Home Assistant instance if something goes wrong.

Key Components:

  • LLaMA 3.1: Used twice in the flow, once for transforming the scraped data into JSON, and again for generating the HTML email content.
  • Google Distance Matrix API: Responsible for calculating distances, helping filter objects based on geographic relevance.
  • Node-RED: Orchestrates the entire process, from scraping to email dispatch, with clean error handling.

This flow demonstrates how Node-RED can be leveraged to create sophisticated workflows that combine external APIs, advanced AI models, and robust automation features.

[
{"id":"8899d0af1bbdfc27","type":"tab","label":"Permis construire alerte","disabled":false,"info":""},
{"id":"d0d50336b4cb2c50","type":"inject","z":"8899d0af1bbdfc27","name":"Samedi 7h00","props":[{"p":"payload"},{"p":"topic","vt":"str"}],"repeat":"","crontab":"00 07 * * 6","once":false,"onceDelay":0.1,"topic":"","payload":"","payloadType":"date","x":140,"y":100,"wires":[["e894160976c800e6"]]},
{"id":"e894160976c800e6","type":"http request","z":"8899d0af1bbdfc27","name":"Scrap web page","method":"GET","ret":"txt","paytoqs":"ignore","url":"https://www.neuchatelville.ch/enquetes-publiques","tls":"","persist":false,"proxy":"","insecureHTTPParser":false,"authType":"","senderr":false,"headers":[],"x":380,"y":120,"wires":[["aa223496fbecbe26"]]},
{"id":"aa223496fbecbe26","type":"cheerio-function","z":"8899d0af1bbdfc27","name":"ce-bodytext","func":"//var nextAll = $('div.ce-bodytext').eq(1).nextAll();\n//var bodytext = ${nextAll.text()};\n\n// @ts-ignore\nmsg.payload = $('div.ce-bodytext').eq(1);\n\n\n//*[@id=\"c14948\"]/div/div\nreturn msg;","outputs":1,"noerr":0,"x":570,"y":120,"wires":[["ba397d4b29263c74"]]},
{"id":"a77124e989d217a6","type":"OpenAI API","z":"8899d0af1bbdfc27","name":"Groq","property":"payload","propertyType":"msg","service":"716812fe2033d25c","method":"createChatCompletion","x":650,"y":260,"wires":[["9c009603ee4a68bb","bb23e3241e1a1ff5"]]},
{"id":"9c009603ee4a68bb","type":"debug","z":"8899d0af1bbdfc27","name":"msg content","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload.choices[0].message.content","targetType":"msg","statusVal":"","statusType":"auto","x":870,"y":260,"wires":[]},
{"id":"cb6783146b04907c","type":"change","z":"8899d0af1bbdfc27","name":"prompt & model setting","rules":[{"t":"set","p":"payload","pt":"msg","to":"{\t   \"messages\":[\t       {\t           \"role\":\"system\",\t           \"content\":\"You generate output in JSON format only, with no further explanatory text. Generate a JSON with this structure : {\\\"enquetes\\\":[{\\\"dossier_number\\\":\\\"116183\\\",\\\"city\\\":\\\"Corcelles - Cormondrèche\\\",\\\"situation\\\":\\\"Rue de la Chapelle 14, Corcelles - Cormondrèche\\\",\\\"description\\\":\\\"Transformation du 1er étage et création d'un studio au rez-de-chaussée\\\",\\\"requerant\\\":\\\"Pierre-Yves Besancet\\\",\\\"auteur\\\":\\\"André Erard\\\",\\\"plans\\\":\\\"MONNIER-ERARD SA ARCHITECTES SIA DIPLOMES EPFL, 2000 NEUCHATEL\\\",\\\"distance\\\":\\\"\\\"}]}. I want an \\\"city\\\" element, whiche takes infos from \\\"Situation\\\" and  from \\\"Parcelle(s) et coordonnées\\\", but only after the number and -. Exemple : from the text : Parcelle(s) et coordonnées: 4412 - Corcelles - Cormondrèche ; 4413/A - Corcelles - Cormondrèche ; I want only : Corcelles - Cormondrèche. Or another exemple: from the text : Parcelle(s) et coordonnées: 15604,14206 - Neuchâtel ; I want only : Neuchâtel. Add an empty filed \\\"distance\\\". Remove all object that are not from cities : Corcelles - Cormondrèche, or Neuchâtel. Keep only object from these two cities. \"\t       },\t       {\t           \"role\":\"user\",\t           \"content\":\"Generate an output JSON base on the following HTML : \\n***\" & payload &\"***\"\t       }\t   ],\t   \"model\":\"llama-3.1-70b-versatile\",\t   \"stream\":false\t}","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":410,"y":260,"wires":[["a77124e989d217a6"]]},
{"id":"ba397d4b29263c74","type":"function","z":"8899d0af1bbdfc27","name":"clean html","func":"\n// transform as a string\nmsg.payload= msg.payload.toString('utf-8')\n\nmsg.payload = msg.payload.replace('<div class=\"ce-bodytext\">', '<div>');\n\n//.replace(' class=ce-bodytext', '');\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":750,"y":120,"wires":[["cb6783146b04907c"]]},
{"id":"ba5ae90d07203061","type":"json","z":"8899d0af1bbdfc27","name":"","property":"payload","action":"","pretty":true,"x":650,"y":320,"wires":[["1a4f8e22de6670be","25067bfa6ea24dd4"]]},
{"id":"87488059aaf01c86","type":"comment","z":"8899d0af1bbdfc27","name":"Transofrm in a JSON objects","info":"","x":920,"y":200,"wires":[]},
{"id":"1a4f8e22de6670be","type":"debug","z":"8899d0af1bbdfc27","name":"clean json","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":860,"y":320,"wires":[]},
{"id":"25067bfa6ea24dd4","type":"loop","z":"8899d0af1bbdfc27","name":"","kind":"enum","count":"","initial":"1","step":"1","condition":"","conditionType":"js","when":"before","enumeration":"payload.enquetes","enumerationType":"msg","limit":"","loopPayload":"loop-index","finalPayload":"final-count","x":610,"y":480,"wires":[["719b2795a011cfea","b8b1603ad1052cd2"],["aa99facb435a05af"]]},
{"id":"719b2795a011cfea","type":"debug","z":"8899d0af1bbdfc27","name":"end of Loop content","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":900,"y":480,"wires":[]},
{"id":"bb23e3241e1a1ff5","type":"change","z":"8899d0af1bbdfc27","name":"Get msg content","rules":[{"t":"set","p":"payload","pt":"msg","to":"payload.choices[0].message.content","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":390,"y":320,"wires":[["ba5ae90d07203061"]]},
{"id":"aa99facb435a05af","type":"function","z":"8899d0af1bbdfc27","name":"Set address","func":"// Check if the structure exists\nif (msg.loop && msg.loop.value && msg.loop.value.situation && msg.loop.value.city) {\n    // Log the city value using node.warn\n    node.warn(\"Address value: \" + msg.loop.value.situation + \", \" + msg.loop.value.city);\n    var address = msg.loop.value.situation + \", \" + msg.loop.value.city;\n    flow.set('address', address);\n    node.warn(\"address new \" + flow.get('address'));\n\n} else {\n    // Log a warning if the city or sitation is not found\n    node.warn(\"Adrress not found in the message\");\n}\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":650,"y":560,"wires":[["db70b4270c628744"]]},
{"id":"db70b4270c628744","type":"function","z":"8899d0af1bbdfc27","name":"Set API parameters","func":"\n// distancematrix/json?key=<yourapiKey>&destinations=<your-destination>&origins=<your-origin>&units=metric\n\nvar key = \"you key\";\nvar units = \"metric\";\n// Exemple :\n//var destinations = \"Your Address\"; \nvar destinations = flow.get('address')||0;\n\n// print in console the destination\n//node.warn(\"New destination : \" + destinations);\nif(msg.loop.value.city.includes(\"Corcelles\")) {\n    var origins = \"my origin\";\n} else {\n    var origins = \"Other origin\";\n}\n\n\nmsg.payload = {\n    origins: origins,\n    destinations: destinations,\n    key: key\n};\n\nreturn msg;\n\n","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":730,"y":600,"wires":[["04369f17d2013e5c"]]},
{"id":"04369f17d2013e5c","type":"http request","z":"8899d0af1bbdfc27","name":"Get distance","method":"GET","ret":"obj","paytoqs":"query","url":"https://maps.googleapis.com/maps/api/distancematrix/json","tls":"","persist":false,"proxy":"","insecureHTTPParser":false,"authType":"","senderr":false,"headers":[],"x":790,"y":640,"wires":[["0ce3b2eda780fcf7","c455c0ef674c44fa"]]},
{"id":"0ce3b2eda780fcf7","type":"debug","z":"8899d0af1bbdfc27","name":"Distance m","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"payload.rows[0].elements[0].distance.value","targetType":"msg","statusVal":"","statusType":"auto","x":1370,"y":640,"wires":[]},
{"id":"c455c0ef674c44fa","type":"function","z":"8899d0af1bbdfc27","name":"keeps only object of interest add to objecstList","func":"// Initialize objectsList as an empty object if it does not exist\nvar objectsList = flow.get('objectsList') || {};\n\n// Extracting values from loop.value\nvar dossier_number = msg.loop.value.dossier_number;\nvar city = msg.loop.value.city;\nvar situation = msg.loop.value.situation;\nvar description = msg.loop.value.description;\nvar requerant = msg.loop.value.requerant;\nvar auteur = msg.loop.value.auteur;\nvar plans = msg.loop.value.plans || \"\";  // Valeur par défaut si non définie\nvar distance = msg.payload.rows[0].elements[0].distance.value;  // Distance from message\n\n// Initialize object with extracted values\nvar newObject = {\n    \"dossier_number\": dossier_number,\n    \"city\": city,\n    \"situation\": situation,\n    \"description\": description,\n    \"requerant\": requerant,\n    \"auteur\": auteur,\n    \"plans\": plans,\n    \"distance\": \"\"  // Distance initialized as an empty string\n};\n\n// Add distance only if less than 500\nif (distance < 500) {\n    newObject.distance = distance.toString();  // Distance update\n}\n\n// Stores newObject in objectsList under a unique key\nobjectsList[dossier_number] = newObject;  // Using dossier_number as a unique key\n\n// Saving updated objectsList in flow context\nflow.set('objectsList', objectsList);\n\nreturn msg;","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[],"x":940,"y":680,"wires":[["25067bfa6ea24dd4"]]},
{"id":"b4505f38e96ec86b","type":"catch","z":"8899d0af1bbdfc27","name":"","scope":["04369f17d2013e5c","2d4cc3fb10088241"],"uncaught":false,"x":620,"y":1040,"wires":[["4efe6e067790ffbe","c759c58bdd1deac9"]]},
{"id":"4efe6e067790ffbe","type":"debug","z":"8899d0af1bbdfc27","name":"debug google API","active":true,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":850,"y":1020,"wires":[]},
{"id":"d5543019cd3673b3","type":"comment","z":"8899d0af1bbdfc27","name":"Loop to JSON objects","info":"","x":200,"y":480,"wires":[]},
{"id":"2d4cc3fb10088241","type":"e-mail","z":"8899d0af1bbdfc27","server":"smtp.server.com","port":"465","authtype":"BASIC","saslformat":true,"token":"oauth2Response.access_token","secure":true,"tls":true,"name":"[email protected]","dname":"Mail server","x":1410,"y":860,"wires":[]},
{"id":"44be0301445794e1","type":"change","z":"8899d0af1bbdfc27","name":"prompt & model setting","rules":[{"t":"set","p":"payload","pt":"msg","to":"{\t   \"messages\":[\t       {\t           \"role\":\"system\",\t           \"content\":\"Tu es un assistant de rédaction d’email en html. Tu recevras un JSON contenant 1 ou plusieurs objets d’enquêtes publiques provenant du site de la ville de Neuchâtel: https://www.neuchatelville.ch/enquetes-publiques. Rédige un email court, en informant des enquêtes publiques à proximité. Puis, liste de manière lisible chaque objet en texte sous forme de liste mise en page. Sépare chaque objet par un espace. Exemple de liste des objets : ***Voici la liste des enquêtes publiques à proximité de chez nous :\\n• <b>Dossier 116978</b> : Installation d'un refroidisseur en toiture. \\nSitué à : Neuchâtel, Rue de Vieux-Châtel 22. \\nRequérant : Antoine Savary RHNe, 2000 Neuchâtel. \\nAuteur : Antoine Savary RHNe, Maladière 45, 2000 NEUCHATEL.\\n\\n\\n*** Ce mail est envoyé aux membres de la famille. Ecris en français. Retourne uniquement le contenu (body) de l'email, n’écris pas d’introduction, ni objet. Commence par : Bonjour à tous, . Renseigne le site internet afin que nous puissions le consulter et que le lien soit actif. En fin de mail, retour à la ligne, tu signes 'le petit robot automatique' avec un emoji. Formate le contenu en html pour email en ajoutant les retours à la ligne et des paragraphe <p>, Utilisez <!DOCTYPE html> pour indiquer à votre programme que vous écrivez un code HTML.\"\t       },\t       {\t           \"role\":\"user\",\t           \"content\":\"L'objet JSON des enquêtes publiques : \\n***\" & payload &\"***\\n retourne moi que le contenu de l'email.\"\t       }\t   ],\t   \"model\":\"llama-3.1-70b-versatile\",\t   \"stream\":false\t}","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":790,"y":860,"wires":[["9282f0c90819e265"]]},
{"id":"9282f0c90819e265","type":"OpenAI API","z":"8899d0af1bbdfc27","name":"Groq","property":"payload","propertyType":"msg","service":"716812fe2033d25c","method":"createChatCompletion","x":990,"y":860,"wires":[["c238e589b4380c4f"]]},
{"id":"b8b1603ad1052cd2","type":"change","z":"8899d0af1bbdfc27","name":"Set topic & payload","rules":[{"t":"set","p":"payload","pt":"msg","to":"objectsList","tot":"flow"},{"t":"set","p":"topic","pt":"msg","to":"Alerte enquêtes publiques","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":550,"y":860,"wires":[["44be0301445794e1"]]},
{"id":"3ed9c212b8b5a5fb","type":"inject","z":"8899d0af1bbdfc27","name":"","props":[{"p":"payload"},{"p":"topic","vt":"str"}],"repeat":"","crontab":"","once":false,"onceDelay":0.1,"topic":"","payload":"","payloadType":"date","x":250,"y":860,"wires":[["b8b1603ad1052cd2"]]},
{"id":"c238e589b4380c4f","type":"change","z":"8899d0af1bbdfc27","name":"Get msg content","rules":[{"t":"set","p":"payload","pt":"msg","to":"payload.choices[0].message.content","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":1170,"y":860,"wires":[["64b7434b0f7f3c13","2d4cc3fb10088241"]]},
{"id":"64b7434b0f7f3c13","type":"debug","z":"8899d0af1bbdfc27","name":"Message","active":false,"tosidebar":true,"console":false,"tostatus":false,"complete":"true","targetType":"full","statusVal":"","statusType":"auto","x":1380,"y":800,"wires":[]},
{"id":"d3e610275fee3e54","type":"comment","z":"8899d0af1bbdfc27","name":"Generate email content and send email","info":"","x":250,"y":780,"wires":[]},
{"id":"33686a7aeeee4b4e","type":"comment","z":"8899d0af1bbdfc27","name":"In case of error... notify me on HA","info":"","x":230,"y":980,"wires":[]},
{"id":"c759c58bdd1deac9","type":"api-call-service","z":"8899d0af1bbdfc27","name":"Notif error HA","server":"2f335157.a6fd8e","version":5,"debugenabled":false,"domain":"notify","service":"mobile_app_iphone","areaId":[],"deviceId":[],"entityId":[],"data":"{\"title\":\"Erreur API public inquiries\",\"message\":\"{{error.source.name}}, {{error.message}}\"}","dataType":"json","mergeContext":"","mustacheAltTags":false,"outputProperties":[],"queue":"none","x":840,"y":1060,"wires":[[]]},
{"id":"696d2a38cd42e110","type":"comment","z":"8899d0af1bbdfc27","name":"Get Google Distance Matrix API","info":"","x":230,"y":640,"wires":[]},
{"id":"635591882f1252e1","type":"comment","z":"8899d0af1bbdfc27","name":"End Loop","info":"","x":1060,"y":720,"wires":[]},
{"id":"716812fe2033d25c","type":"Service Host","apiBase":"https://api.groq.com/openai/v1/","secureApiKeyHeaderOrQueryName":"Authorization","organizationId":"","name":"api.groq.com"},
{"id":"2f335157.a6fd8e","type":"server","name":"Home Assistant","version":5,"addon":true,"rejectUnauthorizedCerts":true,"ha_boolean":"y|yes|true|on|home|open","connectionDelay":true,"cacheJson":true,"heartbeat":false,"heartbeatInterval":30,"areaSelector":"friendlyName","deviceSelector":"friendlyName","entitySelector":"friendlyName","statusSeparator":"at: ","statusYear":"hidden","statusMonth":"short","statusDay":"numeric","statusHourCycle":"h23","statusTimeFormat":"h:m","enableGlobalContextStore":true}
]

Flow Info

Created 8 months ago
Rating: 5 1

Actions

Rate:

Node Types

Core
  • catch (x1)
  • change (x5)
  • comment (x6)
  • debug (x6)
  • function (x4)
  • http request (x2)
  • inject (x2)
  • json (x1)
Other

Tags

  • OpenAI
  • LLama
  • groq
  • Google
  • datamatrix
  • api
  • email
  • scrap
  • web
Copy this flow JSON to your clipboard and then import into Node-RED using the Import From > Clipboard (Ctrl-I) menu option