Move all scripts to 'src' directory

[focloir_wotd]: Add focloir_wotd.pl
[teanglann]: mv 'teanglann_word_of_the_day.pl' teanglann_wotd.pl
2025-01-09 21:30:48 +00:00 · 2024-11-11 01:57:52 +00:00 · 2024-11-08 14:08:09 +00:00 · 2024-11-08 02:40:38 +00:00 · 2024-07-15 15:05:23 +01:00 · 2024-07-15 15:04:26 +01:00
5 changed files with 160 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -1,2 +1,20 @@
 # custom_rss_generators
 A collection of scripts that generate custom RSS feeds
+
+The scripts in this repository do not by default generate output files,
+instead only outputting to `stdout`.
+This is quite elegant if your RSS feed reader supports reading in from
+`stdout`, as it avoids the generation of intermediate files.
+For example, in my [`newsraft`](https://codeberg.org/newsraft/newsraft) `feeds`
+configuration file, I have the following lines:
+```
+@ Events
+$(python3 ~/code/python/custom_rss_generators/roisin_dubh_listings.py 2>/dev/null)  "  Róisín Dubh Event Listings"
+```
+
+However, if your RSS feed reader does not support reading in from `stdout` or running 
+executables to generate feeds, you will need to find another way to utilise these scripts, e.g.
+generating `rss.xml` files on a cron schedule and reading from those in your feed reader by using 
+a link beginning with `file://` or, if possible, doing some kind of command injection attack
+via your configuration file to force your feed reader to execute the script (although if successful 
+& not a deliberate design feature this likely indicates a security issue with your feed reader).
--- a/src/focloir_wotd.pl
+++ b/src/focloir_wotd.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+# Script to scrape the Irish dictionary website focloir.ie and turn the "Word of the Day" into an RSS item
+use strict;
+use warnings;
+use HTML::TreeBuilder;
+use Encode;
+
+# use UTF-8 when writing to STDOUT
+binmode(STDOUT, ":encoding(utf8)"); 
+
+my $url = "https://www.focloir.ie/en/";
+my $user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"; # focloir.ie blocks curl and wget without this
+my $html = `curl --user-agent "$user_agent" "$url"`;
+
+my $tree = HTML::TreeBuilder->new;
+$tree->parse(decode('UTF-8', $html));
+$tree->eof;
+
+my $div   = $tree->look_down(_tag => "div", class => "wotdEntry");
+my $a_tag = $div->look_down(_tag => "div", class => "wotdEntryHdr")->look_down(_tag => "a");
+
+my $word  = $a_tag->look_down(_tag => "span")->as_text;
+my $link  = $a_tag->attr("href");
+my $entry = $div->look_down(_tag => "div", class => "wotdEntryBody")->as_text;
+
+print("
+<rss xmlns:atom='http://www.w3.org/2005/Atom' version='2.0'>
+<channel>
+    <title>Focloir.ie: Focal an Lae</title>
+    <link>" . $url . "</link>
+
+    <item>
+        <title>" . $word . "</title>
+        <link>" . $link . "</link>
+        <pubDate>" . `date "+%a, %d %b %Y %H:%M:%S %z"` . "</pubDate>
+        <description><![CDATA[" . $word . "<br>" . $entry . "]]></description>
+    </item>
+</channel>
+</rss>
+");
+
+$tree = $tree->delete;
--- a/src/roisin_dubh_listings.pl
+++ b/src/roisin_dubh_listings.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+use utf8;
+use JSON;
+use Date;
+use Date::Parse;
+
+# use UTF-8 when writing to STDOUT
+binmode(STDOUT, ":encoding(utf8)");
+
+sub yes_or_no {
+    my ($boolean) = @_;
+    return($boolean eq "1" ? "Yes" : "No");
+}
+
+my $listings = decode_json(`curl "https://roisindubh.net/remote/searchlistings.json"`)->{results};
+
+print("
+<rss xmlns:atom='http://www.w3.org/2005/Atom' version='2.0'>
+<channel><title>Róisín Dubh Listings</title><link>https://roisindubh.net/listings/</link>");
+
+foreach my $listing (@$listings) {
+    my $event_date = str2time($listing->{event_date_time});
+
+    # only print data if the event date is in the future
+    if ($event_date > Date::now()) {
+        print("
+        <item>
+            <title><![CDATA[" . $listing->{pagetitle} . "]]></title>
+            <link>https://roisindubh.net/listings/" . $listing->{alias} . "</link>
+            <pubDate>" . Date::strftime("%a, %d %b %Y %H:%M:%S %z", $event_date) . "</pubDate>
+
+            <description>
+                <![CDATA[
+                    " . $listing->{introtext} . "
+
+                    " . $listing->{content} . "
+
+                Location: " . $listing->{name} . "<br>
+                Event start time: " . Date::strftime("%Y-%m-%d %a %H:%M:%S", $event_date) . "<br>
+                Late night?: " . yes_or_no($listing->{late_night}) . "<br>
+                Postponed?: " . yes_or_no($listing->{postponed}) . "<br>
+                <br>
+                Ticket Price: €" . $listing->{prices}->{regular} . "<br>
+                Ticket Allocation: " . $listing->{ticket_allocation} . "<br>
+                Tickets remaining?: " . yes_or_no($listing->{ticket_remaining}) . "<br>
+                <br>
+                Sales start time: " . Date::strftime("%Y-%m-%d %a %H:%M:%S", str2time($listing->{sales_start})) . "<br>
+                On Sale?: " . yes_or_no($listing->{on_sale}) . "<br>
+                ]]>
+            </description>
+        </item>
+        ");
+    }
+}
+print("</channel></rss>");
--- a/src/roisin_dubh_listings.py
+++ b/src/roisin_dubh_listings.py
@@ -16,7 +16,7 @@ for listing in listings:
        continue

    print('<item>')
-    print('<title>' + listing['pagetitle'] + '</title>')
+    print('<title><![CDATA[' + listing['pagetitle'] + ']]></title>')
    print('<link>https://roisindubh.net/listings/' + listing['alias'] + '</link>')

    print('<description> <![CDATA[')
--- a/src/teanglann_wotd.pl
+++ b/src/teanglann_wotd.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl
+# Script to scrape the Irish dictionary website teanglann.ie and turn the "Word of the Day" into an RSS item
+use strict;
+use warnings;
+use HTML::TreeBuilder;
+use Encode;
+
+# use UTF-8 when writing to STDOUT
+binmode(STDOUT, ":encoding(utf8)"); 
+
+my $url  = "https://www.teanglann.ie/en/";
+my $html = `curl "$url"`;
+
+my $tree = HTML::TreeBuilder->new;
+$tree->parse(decode('UTF-8', $html));
+$tree->eof;
+
+my $div   = $tree->look_down(_tag => "div",  class => "wod");
+my $a_tag =  $div->look_down(_tag => "a",    class => "headword");
+my $entry =  $div->look_down(_tag => "span", class => "entry");
+
+my $word  = $a_tag->as_text;
+my $link  = $a_tag->attr("href");
+
+print("
+<rss xmlns:atom='http://www.w3.org/2005/Atom' version='2.0'>
+<channel>
+    <title>Teanglann.ie: Focal an Lae</title>
+    <link>" . $url . "</link>
+
+    <item>
+        <title>" . $word . "</title>
+        <link>https://www.teanglann.ie" . $link . "</link>
+        <pubDate>" . `date "+%a, %d %b %Y %H:%M:%S %z"` . "</pubDate>
+        <description><![CDATA[" . $entry->as_text . "]]></description>
+    </item>
+</channel>
+</rss>
+");
+
+$tree = $tree->delete;
Author	SHA1	Message	Date
Andrew	a2075e8363	Move all scripts to 'src' directory	2025-01-09 21:30:48 +00:00
Andrew	147d81e546	[focloir_wotd]: Add focloir_wotd.pl	2024-11-11 01:57:52 +00:00
Andrew	e83f513313	[teanglann]: mv 'teanglann_word_of_the_day.pl' teanglann_wotd.pl	2024-11-08 14:08:09 +00:00
Andrew	c98f5a4d32	[teanglann]: Add teanglann_word_of_the_day.pl	2024-11-08 02:40:38 +00:00
Andrew	441ca95add	Formatting	2024-07-15 15:05:23 +01:00
Andrew	cb15f1ef20	Add <br>s to roisin_dubh_listings.pl	2024-07-15 15:04:26 +01:00
Andrew	c9816cbcea	Make pubDate conform to RSS standard in roisin_dubh_listings.pl Also format dates in item descriptions to my taste	2024-07-15 14:59:33 +01:00
Andrew	e8d0b4818e	Rewrite roisin_dubh_listings in Perl	2024-07-14 20:19:24 +01:00
Andrew	b05e6bb8e0	Update README.md	2024-07-02 22:27:25 +01:00
Andrew	3afb45ff8d	Fix malformed XML in roidin_dubh_listings.py Fix unescaped control characters in some listing titles	2024-07-02 21:45:24 +01:00