#!/usr/bin/perl
# Script to scrape the Irish dictionary website focloir.ie and turn the
# "Word of the Day" into an RSS item.
#
# Provenance: added as focloir_wotd.pl in commit
# 147d81e54617390d4fa78a431890aa9cce2eb1fb (Andrew, 2024-11-11).
#
# Flow: fetch the front page with curl, parse it with HTML::TreeBuilder,
# pull the word, its link and its body text out of the "wotdEntry" div, and
# write a single-item RSS 2.0 document to STDOUT as UTF-8.
#
# Exits non-zero (via die) when the page cannot be fetched or when the
# expected elements are not present in the page.
use strict;
use warnings;
use HTML::TreeBuilder;
use Encode;
use POSIX qw(strftime setlocale LC_TIME);

# use UTF-8 when writing to STDOUT
binmode(STDOUT, ":encoding(utf8)");

my $url = "https://www.focloir.ie/en/";
my $user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"; # focloir.ie blocks curl and wget without this

# Download $page_url with curl, sending $agent as the User-Agent header.
# Returns the raw (undecoded) response body; dies if curl cannot be started,
# exits non-zero, or produces no output.
sub fetch_page {
    my ($page_url, $agent) = @_;

    # List-form pipe open: the arguments go straight to curl, no shell is
    # involved, so nothing in the URL or user agent needs quoting.
    # --fail turns HTTP errors into a non-zero exit instead of an error page;
    # --silent/--show-error drop the progress meter but keep real errors.
    open(my $curl, '-|', 'curl', '--silent', '--show-error', '--fail',
        '--user-agent', $agent, $page_url)
        or die "Cannot run curl: $!\n";
    binmode($curl);
    my $body = do { local $/; <$curl> };

    # close() on a pipe returns false when the child exited non-zero; in that
    # case $! is 0 and the exit status is in $?.
    close($curl)
        or die($! ? "Error reading from curl: $!\n"
                  : "curl exited with status " . ($? >> 8) . " while fetching $page_url\n");

    die "curl returned an empty page for $page_url\n"
        unless defined $body && length $body;
    return $body;
}

# Return the first descendant of $node matching the look_down @criteria,
# or die with a message naming $what if there is none.
sub find_or_die {
    my ($node, $what, @criteria) = @_;
    my $found = $node->look_down(@criteria);
    die "Could not find $what in the page; has the layout changed?\n"
        unless defined $found;
    return $found;
}

# Escape the characters that are special in XML text and attribute values.
sub xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

my $html = fetch_page($url, $user_agent);

my $tree = HTML::TreeBuilder->new;
$tree->parse(decode('UTF-8', $html));
$tree->eof;

my $div    = find_or_die($tree, 'the "wotdEntry" div', _tag => "div", class => "wotdEntry");
my $header = find_or_die($div, 'the "wotdEntryHdr" div', _tag => "div", class => "wotdEntryHdr");
my $a_tag  = find_or_die($header, 'the word-of-the-day link', _tag => "a");

my $word = find_or_die($a_tag, 'the word-of-the-day span', _tag => "span")->as_text;
my $link = $a_tag->attr("href");
die "The word-of-the-day link has no href attribute\n" unless defined $link;
my $entry = find_or_die($div, 'the "wotdEntryBody" div', _tag => "div", class => "wotdEntryBody")->as_text;

# The extracted strings are plain copies, so the tree can be released now.
$tree = $tree->delete;

# RSS dates are RFC 822 dates with English day and month names, so format
# under the C locale rather than whatever the environment selects.
setlocale(LC_TIME, "C");
my $pub_date = strftime("%a, %d %b %Y %H:%M:%S %z", localtime());

# as_text and attr return entity-decoded text, so re-escape for XML.
my $url_xml  = xml_escape($url);
my $word_xml = xml_escape($word);
my $link_xml = xml_escape($link);

# A literal "]]>" in the text would end the CDATA section early; split it
# across two adjacent CDATA sections instead.
(my $entry_cdata = $entry) =~ s/\]\]>/]]]]><![CDATA[>/g;

print <<"END_RSS";
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
    <title>Focloir.ie: Focal an Lae</title>
    <link>${url_xml}</link>
    <description>Focloir.ie: Focal an Lae</description>
    <item>
        <title>${word_xml}</title>
        <link>${link_xml}</link>
        <pubDate>${pub_date}</pubDate>
        <description><![CDATA[${entry_cdata}]]></description>
    </item>
</channel>
</rss>
END_RSS