#!/usr/bin/perl -w
#
# Version 5
# changed to view file as stream. see
# http://cgi.kript.net/blosxom.cgi/2007/03/09#GrabRSSEnclosures3
# Version 4
# changed script to scan for files ending in mp3 rather than the enclosure tag
# Version 3
# only downloads files if it hasn't seen them before and they successfully
#  downloaded last time; if not, keep trying (for aborted downloads).
# Version2
# added proper tempfile code
# Version1
# initial version

use strict;
use XML::RSS::Parser::Lite;
use LWP::Simple;
use MLDBM 'DB_File';
use File::Temp ();
use URI::Escape;
 
# Script-wide state.  Everything is declared up front because the main code
# further down assigns to these names without declaring them again.
# NOTE(review): $item_url, $clean_xml and @enclosures_to_grab look unused in
# the visible code - confirm before removing them.
my ($item_url, $url, $rss, $xml, $temp_file, $filename_base);
my ($state_file, $download_file, $temp_handle, $clean_xml);
my (@enclosures_to_grab);
my (%seen);

#Initialise Variables
# The feed URL is the only command line argument; without it (or with an
# empty/false value) print the usage text and stop.
$url = $ARGV[0]
	or die "Usage: $0 <url>\nWhere URL is the rss 2.0 feed you want to grab the enclosures of\n";

#routine to download the mp3/whatever from the enclosure
#
# Arguments:
#   $download_url  - the URL to fetch
#   $download_file - the local file name the content is stored under
# Returns:
#   1 when the HTTP status reported by getstore() is a success (2xx),
#   0 otherwise (after printing the reason).
sub RetrieveFile
{
	my ($download_url, $download_file) = @_;

	# getstore() returns the HTTP status code, which is a true value even
	# for 404 or 500, so it has to be classified with is_success() rather
	# than tested for truth.
	my $status = getstore($download_url, $download_file);
	if ( is_success($status) )
	{
		return 1;
	}

	# $! is not set by LWP, so report the HTTP status instead.
	my $reason = status_message($status) || 'unknown status';
	print "Cannot download $download_file from $download_url because: "
		. "HTTP status $status ($reason)\n";
	return 0;
}

#####################
## Main code starts here
#####################

#setup the tempfile - this will be removed automatically when script finishes
$temp_handle = File::Temp->new();
$temp_file = $temp_handle->filename;


#retrieve the feed or die gracefully
# getstore() returns the HTTP status code (always a true value), so it is
# checked with is_success() instead of carrying on with an empty temp file.
my $feed_status = getstore($url, $temp_file);
is_success($feed_status)
	or die "failed to get $url: HTTP status $feed_status\n";

#slurp in the file - and close again so we can read it again later
# this is so we only read the RSS file once for each script run
# good behaviour, in other words
{
	local $/;
	open( my $feed_fh, '<', $temp_file )
		or die "can't access $temp_file: $!\n";
	$xml = <$feed_fh>;
	close($feed_fh);
}
( defined($xml) && length($xml) )
	or die "feed at $url is empty\n";

#replace the line endings with sane ones
# (CRLF and bare CR both become a single LF)
$xml =~ s/\r\n?/\n/g;

$rss = XML::RSS::Parser::Lite->new();
$rss->parse($xml);

##make the filename & sanitise
# fall back to a fixed name when the feed has no usable title, so the state
# file is never just ".db"
my $feed_title = $rss->get('title');
unless ( defined($feed_title) && length($feed_title) )
{
	$feed_title = 'untitled_feed';
}
$filename_base = $feed_title;
#replace any spaces with underscores
$filename_base =~ s/ /_/g;
chomp($filename_base);
#escape everything else that is not safe in a file name (including '/')
$filename_base = uri_escape($filename_base);
$state_file = $filename_base . ".db";


#if we've run this before, get the url's we've already processed
tie (%seen, 'MLDBM', $state_file)
	or die "can't open state file $state_file: $!\n";

#tell the user what we're doing
print "=====\nFeed Processing for " . $feed_title . "\n";


#scan the feed for url="..." attributes
# [^"]* stops at the closing quote of this attribute; a greedy .* would run
# on to the last quote on the line and swallow any following attributes.
while ($xml =~ m/url="([^"]*)"/gi)
{
	#then extract the url to the mp3/mp4
	my $enclosure = $1;

	#nothing to do for enclosures we have already downloaded
	next if exists($seen{$enclosure});

	#extract the filename from the url path
	my @url_path = split(/\//, $enclosure);
	$download_file = pop @url_path;
	unless ( defined($download_file) && length($download_file) )
	{
		print "Skipping $enclosure: cannot work out a file name\n";
		next;
	}

	#try to download the file.  Only add the enclosure to the
	# 'seen' hash if we successfully do so
	print "Retrieve $download_file\n";
	if ( RetrieveFile($enclosure, $download_file) )
	{
		$seen{$enclosure} = $enclosure;
	}
}


#save the hash for later checking
untie %seen;

=pod

=head1 GrabRSSEnclosures3.pl

usage: B<GrabRSSEnclosures3.pl> <RSS URL>

Takes an RSS feed url, downloads it and searches for enclosure tags, extracts the url, and attempts to download each.  If it does successfully download the file, it adds it to an MLDBM hash (saved with the same name as the feed's name), so that it doesn't try to download it again.

Written by john@kript.net.  Licensed under the GPL.

Check for updates at http://cgi.kript.net/blosxom.cgi/code/

Download the latest version; http://www.kript.net/perl/GrabRSSEnclosures3.pl

=cut
