Hi there,

I finally managed to sit down, import the gnupod cvs into a git
repository and make some nice clean patches to fix some problems 
and to add some features.

You should be able to apply them without stripping the comments. "patch"
is smart enough to ignore them.

Some of this I've already sent to some people and some I've only
mentioned. I know, "Talk is cheap, show me the code." so here it is.
Have fun and don't bug adrian about it if it makes your computer eat 
your homework or rape your cat.

cheers
-henrik

PS: I'm still working on artwork support for podcasts and the last
version I tried really did show the podcast's image... but it ate all the
other artwork of every other file that I had on my ipod. So there's
still room for improvement :-)

commit 4fd74c9375de0512ec6a6c7a06d4a0955a4f3d8c
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Fri Apr 11 22:31:52 2008 +0200

    Added XML-escaping of single quote.

diff --git a/src/ext/XMLhelper.pm b/src/ext/XMLhelper.pm
index b972603..0f59770 100755
--- a/src/ext/XMLhelper.pm
+++ b/src/ext/XMLhelper.pm
@@ -117,6 +117,7 @@ sub xescaped {
 	my ($ret) = @_;
 	$ret =~ s/&/&amp;/g;
 	$ret =~ s/"/&quot;/g;
+	$ret =~ s/\'/&apos;/g;
 	$ret =~ s/</&lt;/g;
 	$ret =~ s/>/&gt;/g;
 	#$ret =~ s/^\s*-+//g;
commit 00d14bb42a89f0e95fa12110ead64416e8bb063e
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Fri Apr 11 22:39:42 2008 +0200

    Improved error reporting on parsing invalid GNUtunesDB.xml

diff --git a/src/ext/XMLhelper.pm b/src/ext/XMLhelper.pm
index 0f59770..5eaeb48 100755
--- a/src/ext/XMLhelper.pm
+++ b/src/ext/XMLhelper.pm
@@ -292,8 +292,14 @@ sub mkh {
 sub doxml {
 	my($xmlin, %opts) = @_;
 	return undef unless (-r $xmlin);
-	my $p = new XML::Parser(Handlers=>{Start=>\&eventer});
-	   $p->parsefile($xmlin);
+	my $p;
+	my $ref = eval {
+		$p = new XML::Parser(ErrorContext => 0, Handlers=>{Start=>\&eventer});
+		$p->parsefile($xmlin);
+	};
+	if($@) {
+		die "An error occurred reading $xmlin :\n", $@ ;
+	}
 	return $p;
 }
 
commit 8cf37a86ddd8f2d1e3df11d7b31fe31dc0c7dbcc
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Sat Apr 12 02:22:10 2008 +0200

    Fixed podcast XML parser handlers to combine CDATA that comes in chunks.

diff --git a/src/gnupod_addsong.pl b/src/gnupod_addsong.pl
index e1e3a60..0f861fe 100644
--- a/src/gnupod_addsong.pl
+++ b/src/gnupod_addsong.pl
@@ -393,6 +393,7 @@ sub PODCAST_fetch {
 sub podcastStart {
 	my($hr,$el,@it) = @_;
 	my $hashref_key = $hr->{Base};
+	undef($hr->{cdatabuffer});
 	if($hr->{Context}[-2] eq "rss" &&
 	   $hr->{Context}[-1] eq "channel" &&
 		 $el eq "item") {
@@ -412,13 +413,22 @@ sub podcastStart {
 # => Fillsup %podcast_infos
 sub podcastChar {
 	my($hr,$el) = @_;
+	$hr->{cdatabuffer} .= $el;
+}
+
+#############################################################
+#Eventer for END
+# => Fillsup %podcast_infos
+sub podcastEnd {
+	my($hr,$el) = @_;
 	my $hashref_key = $hr->{Base};
-	if($hr->{Context}[-4] eq "rss" &&
-	   $hr->{Context}[-3] eq "channel" &&
-	   $hr->{Context}[-2] eq "item") {
-		my $ccontext = $hr->{Context}[-1];
-		${$podcast_infos{$hashref_key}}[-1]->{$ccontext}->{"\0"} ||= $el;
+	if(defined($hr->{cdatabuffer}) &&
+	   $hr->{Context}[-3] eq "rss" &&
+	   $hr->{Context}[-2] eq "channel" &&
+	   $hr->{Context}[-1] eq "item") {
+		${$podcast_infos{$hashref_key}}[-1]->{$el}->{"\0"} ||= $hr->{cdatabuffer};
 	}
+	undef($hr->{cdatabuffer});
 }
 
 #############################################################
@@ -448,7 +458,7 @@ sub resolve_podcasts {
 			}
 			#Add the stuff to %podcast_infos and unlink the file after this.
 			eval {
-				my $px = new XML::Parser(Handlers=>{Start=>\&podcastStart, Char=>\&podcastChar});
+				my $px = new XML::Parser(Handlers=>{Start=>\&podcastStart, Char=>\&podcastChar, End=>\&podcastEnd});
 				$px->parsefile($pcrss->{file});
 			};
 			warn "! [HTTP] Error while parsing XML: $@\n" if $@;
commit 30bb8f45dce5e0a57205a677a2b7b28996140e15
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Sat Apr 12 02:29:13 2008 +0200

    Added release date to podcasts extracted from the pubDate element.

diff --git a/src/gnupod_addsong.pl b/src/gnupod_addsong.pl
index 0f861fe..a21ac2d 100644
--- a/src/gnupod_addsong.pl
+++ b/src/gnupod_addsong.pl
@@ -29,6 +29,7 @@ use GNUpod::ArtworkDB;
 use Getopt::Long;
 use File::Copy;
 use File::Glob ':glob';
+use Date::Parse;
 
 use constant MEDIATYPE_PODCAST_AUDIO => 4;
 use constant MEDIATYPE_PODCAST_VIDEO => 6;
@@ -487,6 +488,7 @@ sub resolve_podcasts {
 			my $c_title = $podcast_item->{title}->{"\0"};
 			my $c_author = $podcast_item->{author}->{"\0"};
 			my $c_desc  = $podcast_item->{description}->{"\0"};
+			my $c_rdate = $podcast_item->{pubDate}->{"\0"};
 			my $c_url   = $podcast_item->{enclosure}->{url};
 			#We use the URL as GUID if there isn't one...			
 			my $c_guid  = $podcast_item->{guid}->{"\0"} || $c_url;
@@ -517,6 +519,7 @@ sub resolve_podcasts {
 			$per_file_info{$rssmedia->{file}}->{title}       = $c_title   if $c_title;
 			$per_file_info{$rssmedia->{file}}->{artist}      = $c_author  if $c_author;
 			$per_file_info{$rssmedia->{file}}->{desc}        = $c_desc    if $c_desc;
+			$per_file_info{$rssmedia->{file}}->{releasedate} = int(Date::Parse::str2time($c_rdate))+MACTIME  if $c_rdate;
 			
 			push(@files,$rssmedia->{file});
 		}
commit 28000b1b009cb4b218d113bd5afc61fb7f67ca91
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Sat Apr 12 02:36:15 2008 +0200

    Added another parser run on the podcast to extract channel information.
    
    This patch doesn't do anything user visible but forms the basis to
    use channel information in the podcast items.

diff --git a/src/gnupod_addsong.pl b/src/gnupod_addsong.pl
index a21ac2d..6768645 100644
--- a/src/gnupod_addsong.pl
+++ b/src/gnupod_addsong.pl
@@ -35,7 +35,7 @@ use constant MEDIATYPE_PODCAST_AUDIO => 4;
 use constant MEDIATYPE_PODCAST_VIDEO => 6;
 
 use constant MACTIME => GNUpod::FooBar::MACTIME;
-use vars qw(%opts %dupdb_normal %dupdb_lazy %dupdb_podcast $int_count %podcast_infos %per_file_info);
+use vars qw(%opts %dupdb_normal %dupdb_lazy %dupdb_podcast $int_count %podcast_infos %podcast_channel_infos %per_file_info);
 
 print "gnupod_addsong.pl Version ###__VERSION__### (C) Adrian Ulrich\n";
 
@@ -433,6 +433,56 @@ sub podcastEnd {
 }
 
 #############################################################
+#Eventer for START:
+# -> Push array if we found a new item beginning
+# -> Add '<foo bar=barz oink=yak />' stuff to the hash
+# => Fillsup %podcast_channel_infos
+sub podcastChannelStart {
+	my($hr,$el,@it) = @_;
+	my $hashref_key = $hr->{Base};
+	$hr->{cdatabuffer} = undef;
+	if($hr->{Context}[-1] eq "rss" &&
+	   $el eq "channel") {
+		push(@{$podcast_channel_infos{$hashref_key}}, {});
+	}elsif($hr->{Context}[-2] eq "rss" &&
+	   $hr->{Context}[-1] eq "channel" &&
+	   $el ne "item") {
+		if (@it) {
+			my $xref = GNUpod::XMLhelper::mkh($el,@it);
+			${$podcast_channel_infos{$hashref_key}}[-1]->{$el} ||= $xref->{$el};
+		}
+	}
+}
+
+#############################################################
+#Eventer for <foo>CONTENT</foo>
+# => Fillsup %podcast_channel_infos
+sub podcastChannelChar {
+	my($hr,$el) = @_;
+	$hr->{cdatabuffer} .= $el;
+}
+
+#############################################################
+#Eventer for END
+# => Fillsup %podcast_channel_infos
+sub podcastChannelEnd {
+	my($hr,$el) = @_;
+	my $hashref_key = $hr->{Base};
+	if(defined($hr->{cdatabuffer}) &&
+	   $hr->{Context}[-2] eq "rss" &&
+	   $hr->{Context}[-1] eq "channel" &&
+	   $el ne "item") {
+		${$podcast_channel_infos{$hashref_key}}[-1]->{$el}->{"\0"} ||= $hr->{cdatabuffer};
+	}elsif(defined($hr->{cdatabuffer}) &&
+	   $hr->{Context}[-3] eq "rss" &&
+	   $hr->{Context}[-2] eq "channel" &&
+	   $hr->{Context}[-1] ne "item") {
+		${$podcast_channel_infos{$hashref_key}}[-1]->{$hr->{Context}[-1]}->{$el}->{"\0"} ||= $hr->{cdatabuffer};
+	}
+	$hr->{cdatabuffer} = undef; # make sure it doesn't get added to the parent element as well
+}
+
+#############################################################
 # This is the heart of our podcast support
 #
 sub resolve_podcasts {
@@ -461,6 +511,8 @@ sub resolve_podcasts {
 			eval {
 				my $px = new XML::Parser(Handlers=>{Start=>\&podcastStart, Char=>\&podcastChar, End=>\&podcastEnd});
 				$px->parsefile($pcrss->{file});
+				my $py = new XML::Parser(Handlers=>{Start=>\&podcastChannelStart, Char=>\&podcastChannelChar, End=>\&podcastChannelEnd});
+				$py->parsefile($pcrss->{file});
 			};
 			warn "! [HTTP] Error while parsing XML: $@\n" if $@;
 			unlink($pcrss->{file}) or warn "Could not unlink $pcrss->{file}, $!\n";
@@ -482,6 +534,9 @@ sub resolve_podcasts {
 		}
 	}
 
+#	use Data::Dumper;
+#	print Dumper(\%podcast_channel_infos);
+
 	foreach my $key (keys(%podcast_infos)) {
 		my $cref = $podcast_infos{$key};
 		foreach my $podcast_item (@$cref) {
commit 3a78d47e485892d28cf17aba5c237f87528a4ef3
Author: Heinrich Langos <[EMAIL PROTECTED]>
Date:   Sat Apr 12 03:54:48 2008 +0200

    Added "--podcast-cache-dir" option to allow caching of podcast media files.
    
    First I did it to make testing of podcast related features easier but then I realized
    that you could use other software to download the podcasts while your iPod is
    not connected. Reducing the time that you can't use your precious. :-)
    
    When downloading "http://example.com/dir1/file1.mp3" using "--podcast-cache-dir=/tmp/foo"
    the cache will look for "/tmp/foo/example.com/dir1/file1.mp3" and "/tmp/foo/file1.mp3".
    Only a rough size check is done since most podcasts report a wrong size for their media
    files. Most times about 500-1000 bytes but sometimes up to 50kb wrong. Currently files
    bigger than expected are accepted from cache while smaller files are downloaded again.
    I'll change this to allow +/- 5% difference.
    
    The RSS feed's XML files are not cached.
    
    Files in the cache do not expire. So you'll have to cleanup yourself once in a while.

diff --git a/src/gnupod_addsong.pl b/src/gnupod_addsong.pl
index 6768645..3ffec29 100644
--- a/src/gnupod_addsong.pl
+++ b/src/gnupod_addsong.pl
@@ -47,11 +47,11 @@ GetOptions(\%opts, "version", "help|h", "mount|m=s", "decode|x=s", "restore|r",
                    "set-title|t=s", "set-artist|a=s", "set-album|l=s", "set-genre|g=s", "set-rating=i", "set-playcount=i",
                    "set-bookmarkable|b", "set-shuffleskip", "artwork=s",
                    "set-songnum", "playlist|p=s@", "reencode|e=i",
-                   "min-vol-adj=i", "max-vol-adj=i", "playlist-is-podcast", "podcast-files-limit=i", "set-compilation");
+                   "min-vol-adj=i", "max-vol-adj=i", "playlist-is-podcast", "podcast-files-limit=i", "podcast-cache-dir=s", "set-compilation");
 
 GNUpod::FooBar::GetConfig(\%opts, {'decode'=>'s', mount=>'s', duplicate=>'b', model=>'s',
                                    'disable-v1'=>'b', 'disable-v2'=>'b', 'set-songnum'=>'b',
-                                   'min-vol-adj'=>'i', 'max-vol-adj'=>'i', 'automktunes'=>'b', 'podcast-files-limit'=>'i' },
+                                   'min-vol-adj'=>'i', 'max-vol-adj'=>'i', 'automktunes'=>'b', 'podcast-files-limit'=>'i', 'podcast-cache-dir'=>'s' },
                                    "gnupod_addsong");
 
 
@@ -381,11 +381,69 @@ sub newpl {
 # Calls curl to get files
 sub PODCAST_fetch {
 	my($url,$prefix) = @_;
+	print "* [HTTP] Downloading $url ...\n";
 	my $tmpout = GNUpod::FooBar::get_u_path($prefix,"");
 	my $return = system("curl", "-s", "-L", "-o", $tmpout, $url);
 	return{file=>$tmpout, status=>$return};
 }
 
+sub PODCAST_fetch_media {
+	my($url,$prefix,$length) = @_;
+	if ($opts{'podcast-cache-dir'}) {
+	
+		my @cachefilecandidates = ();
+		my $deepcachefile = $opts{'podcast-cache-dir'}."/".PODCAST_get_sane_path_from_url($url , "");
+		push @cachefilecandidates, $deepcachefile  if $deepcachefile;
+		
+		my $flatcachefile = $opts{'podcast-cache-dir'}."/".PODCAST_strictly_sanitze_path_element((split(/\//, $url))[-1], "cachefile");
+		push @cachefilecandidates, $flatcachefile;
+
+		foreach my $cachefile (@cachefilecandidates) {
+			if ( -e $cachefile && -r $cachefile && $length > (stat($cachefile))[7] ) {
+				my $sizedelta = int($length) - int((stat($cachefile))[7]) ;
+				print "* [HTTP] Not using cached file $cachefile ... (size:".(stat($cachefile))[7]." (".$sizedelta." bytes too small))\n";
+			}
+			if ( -e $cachefile && -r $cachefile && $length <= (stat($cachefile))[7] ) {
+				my $sizedelta = int((stat($cachefile))[7]) - int($length);
+				print "* [HTTP] Using cached file $cachefile ... (size:".(stat($cachefile))[7]."".
+					($sizedelta ? " (".$sizedelta." bytes bigger than expected)" : "(matches expected length)").")\n";
+				return {file=>$cachefile, status=>0};
+			}
+		}
+		print "* [HTTP] Downloading $url ...\n";
+		my $return = system("curl", "-s", "-L", "--create-dirs", "-o" , $deepcachefile, $url);
+		return {file=>$deepcachefile, status=>$return};
+	}
+	else {
+		return PODCAST_fetch($url,$prefix);
+	}
+}
+
+sub PODCAST_strictly_sanitze_path_element {
+	my ($name,$default) = @_;
+	$name =~ s/[^.0-9a-zA-z()_-]/_/g; # limit valid character set
+	$name =~ s/^[.]*//g; #remove leading dots
+	$name =~ s/[.]*$//g; #remove trailing dots (causes problems on Windows, I heard)
+	$name = $default unless $name; #default if empty
+	return $name;
+}
+	
+
+sub PODCAST_get_sane_path_from_url {
+	my($uri,$default) = @_;
+	my($scheme, $authority, $path, $query, $fragment) = $uri =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
+	my @pathelements = ($authority, split (/\//, $path));
+	my @cleanpathelements=();
+	foreach my $pe ( @pathelements ) {
+		push @cleanpathelements, PODCAST_strictly_sanitze_path_element($pe,'');
+	}
+	my $cleanpath =  join ("/", @cleanpathelements);
+	$cleanpath =~ s|/[/]+|/|g; # collaps multiple /
+	$cleanpath = $default  if ! $cleanpath;
+	$cleanpath = $default  if $cleanpath eq "/";
+	return $cleanpath;
+}
+
 #############################################################
 #Eventer for START:
 # -> Push array if we found a new item beginning
@@ -558,14 +616,13 @@ sub resolve_podcasts {
 				warn "! [HTTP] Podcast $c_url ($c_title) exists, no need to download this file\n";
 				next;
 			}		
-			print "* [HTTP] Downloading $c_url ...\n";
-			my $rssmedia = PODCAST_fetch($c_url, "/tmp/gnupodcast_media");
+			my $rssmedia = PODCAST_fetch_media($c_url, "/tmp/gnupodcast_media", $podcast_item->{enclosure}->{length});
 			if($rssmedia->{status} or (!(-f $rssmedia->{file}))) {
 				warn "! [HTTP] Failed to download $c_url to $rssmedia->{file}\n";
 				next;
 			}
 			
-			$per_file_info{$rssmedia->{file}}->{UNLINK}    = 1;  # Remote tempfile
+			$per_file_info{$rssmedia->{file}}->{UNLINK}    = 1 unless $opts{'podcast-cache-dir'};  # Remove tempfile if not caching
 			$per_file_info{$rssmedia->{file}}->{ISPODCAST} = 1;  # Triggers mediatype fix
 			
 			# Set information/tags from XML-File
@@ -626,6 +683,7 @@ Usage: gnupod_addsong.pl [-h] [-m directory] File1 File2 ...
    -d, --duplicate                  Allow duplicate files
    -p, --playlist=string            Add songs to this playlist, can be used multiple times
        --playlist-is-podcast        Set podcast flag for playlist(s) created using '--playlist'
+       --podcast-cache-dir=string   Set a directory in which podcast media files will be cached.
        --podcast-files-limit=int    Limit the number of files that are downloaded.
                                     0 = download all (default), -X = download X oldest items, X = download X newest items
        --disable-v1                 Do not read ID3v1 Tags (MP3 Only)
_______________________________________________
Bug-gnupod mailing list
[email protected]
http://lists.nongnu.org/mailman/listinfo/bug-gnupod

Reply via email to