#! /usr/bin/perl
# working version, not documented
# news lent script (rss)
use XML::Parser::Lite;
use Socket;
use Encode 'from_to';

my $home="./dir";
my $temp=".";
my $xmlname="$home/newz";
my $htmlname="$home/newz";

$|=1;
my $proxy='195.50.2.154:8080';
my $html_enc = 'windows-1251';


my %cont=(
'mahatma.bspu.unibel.by'=>1,
'www.bspu.unibel.by'=>1
);

sub add_cont{
my $l=$_[0].'/';
my $x;
$l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei;
$cont{$x}=1;
}

sub esc{
local $1;
my $x=shift;
$x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg;
$x;
}

sub unesc{
my $x=shift;
local $1;
$x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$x;
}

sub get_xml{
my ($s,%h,@a,@a1,@ad);
@a=split(/:\/\//,$_[0],2);
unshift @a,'http' if(!defined($a[1]));
@a[1,3]=split(/\//,$a[1],2);
@ad=@a[1,2]=split(/:/,$a[1],2);
@a1=@a;
$ad[1]||=80;
$a1[0]&&="$a[0]://";
$a1[2]&&=":$a[2]";
$a1[3]&&="/$a[3]";
if($proxy){
 @ad=split(/:/,$proxy,2);
 $a1[3]=join('',@a1);
}
socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&&
connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))||return;
select(SO);$|=1;select(STDOUT);
print SO qq($_[1] $a1[3] HTTP/1.1
Host: $a1[1]
User-Agent: robot
Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
Accept-Language: ru,be;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: none
$_[2]Connection: close

);
while((!eof(SO))&&defined(my $x=<SO>)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last}
$s=~s/(.*?): (.*?)[\r]\n/$h{lc($1)}=$2/gise;
undef $s;
while((!eof(SO))&&defined(my $x=<SO>)){$s.=$x}
close(SO);
substr($s,index($s,'<?xml')),%h
}

sub url{
my $u=$_[0];
my $t=$_[1]||$u;
"<a href=\"$u\">$t</a>"
}

my (@block,%item,@items);

## 'id'=>[start,char,end,start1,char1.end1];
my %blocks=(
'rss.channel.item'=>[
sub{%item=()},
undef,
sub{push @items,{%item};undef %item},
undef,
sub{shift;$item{$block[$#block]}=join('',@_)},
undef
],
'rss.channel.item.link'=>[
]
);

#$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'};

my %handlers=(
Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event},
Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event},
End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event},
);


sub parser_event{
my $e=shift;
my $id=join('.',@block[0..$#block-($e>3)]);
#print "$id\n";
if(exists($blocks{$id})){
 my $h=$blocks{$id};
 goto ref($h)||return;
 HASH:return $h->{('Start','Char','End')[$e]}(@_);
 ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef;
 SCALAR:return;
}
}


sub get_rss{
my ($x,%h,$ff,$ffb,$t,$p);
if(substr($_[0],0,7) eq 'file://'){
$ff=substr($_[0],7);
return if(!-e $ff);
goto FILE;
}
add_cont($_[0]);
#print "get $_[0]\n";
($x,%h)=get_xml($_[0],'HEAD');
return if(!defined(%h));
$h{'last-modified'}||=gmtime;
if(exists($h{'last-modified'})){
$ffb="$temp/newz-".esc($_[0]);
stat($ff="$ffb.".esc($h{'last-modified'}));
if(-e _){
FILE:
 open FF,"<$ff" or die "$!";
 sysread(FF,$x,-s FF);
 close(FF);
}else{
 ($x,%h)=get_xml($_[0],'GET');
 return if(!defined($x));
 while(my $d=<$ffb.*>){unlink($d);}
 open FF,">$ff" or die "$! $ff";
 print FF $x;
 close(FF);
}
}else{($x,%h)=get_xml($_[0],'GET');
$h{'last-modified'}.=localtime;
}
$p=new XML::Parser::Lite;
$p->setHandlers(%handlers);
my ($enc,@a);
$x=~s/\<\?xml(.*?)\>/push @a,$1;"\<\?xml$1\>"/ges;
for(@a){$_=~s/ encoding\=\"(.*?)\"/$enc=lc($1);''/ges}
from_to($x,$enc,'utf-8') if(defined($enc) && $enc ne 'utf-8');
$p->parse($x);
}

my %htm=(
'lt'=>'<',
'gt'=>'>',
'amp'=>'&',
'quot'=>'"'
);

sub dehtml{
my $s=shift;
$s=~s/\&(.*?)\;/$htm{$1}||"&$1;"/gse;
$s
}

my (@news0,%news,%nh);
my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);

sub addnews{
my ($l,$x);
my $u=$_[0];
for(my $i=$#items;$i>=0;$i--){
 $l=$items[$i]->{link};
 if($u){
  $x=$l.'&';
  $x=~s/[\&\?\;]$u\=(.*?)\&/$l=dehtml($1);''/gse;
  $l="http://$l" if(index($l,'://')==-1);
  $items[$i]->{xlink}=$l;
 };
 if(!exists($nh{$l})){
  $nh{$l}=$items[$i];
  my $t=$items[$i]->{pubDate};
  my $t1=0;
  $t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e;
  $t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e;
  $t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex;
  $t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex;
  $news{$t1}=$items[$i];
  add_cont($l);
 }
}
undef @items;
}



my $time=gmtime;
#print qq(Content-type: text/html; codepage=utf-8
#Last-modified: $time
#
my $time1=$time;
$time1=~s/ /&nbsp;/gs;

my $fh=qq(<html><head><title>Breaking News!</title>
<STYLE type="text/css">
body{margin-left:2em;margin-right:2em;text-align:justify;text-indent:2em;margin-top:0px;margin-bottom:0px;}
table{margin-left:0em;margin-right:0em;text-indent:0em;margin-top:0px;margin-bottom:0px;}
li{text-align:justify;text-indent:0em;margin-top:3px;margin-bottom:3px;}
</STYLE>
</head><body><table width=100% height=0 border=0 cellpadding=0 cellspacing=0><tr><td align=left valign=top><b><font size=+2 face=times>Open Source News</font></b></td><td align=right valign=top>$time1<br>).
url("dir/")."</td></tr></table><ul>";

get_rss("file://$xmlname.xml");&addnews;
#get_rss('http://newsrss.bbc.co.uk/rss/russian/institutional/pda/rss.xml');
get_rss('http://news.google.com/news?hl=en&ned=tus&ie=UTF-8&scoring=d&q=Belarus+OR+Belorussia+OR+Belarussian&output=rss');&addnews('url');
#get_rss('http://www.afn.by/news/rss/'); &addnews;
get_rss('http://news.tut.by/rss/all.rss');&addnews;
get_rss('http://www.charter97.org/export/index.xml');&addnews;
#get_rss('http://www.euronews.net/rss/euronews_ru.xml');&addnews;
get_rss('http://news.yandex.ru/Belarus/index.rss'); addnews('cl4url');

#get_rss('http://www.cnews.ru/news.xml');


for(sort keys %news){
unshift @news0,$news{$_}
}

open FF,">$xmlname.xml" or die "$!";

print FF q(<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Breaking News!</title>
<link>http://news.eu.by/</link>
<description>Open Source News</description>);

for(@news0){
 $fh.="\n<li>".(exists($_->{'xlink'})?url(unesc($_->{'xlink'}),'<i>link</i>')." - <b>":'').url($_->{'link'},dehtml($_->{'title'})).'</b> '.dehtml($_->{'description'});
 print FF "\n<item>";
 for my $tag(keys %{$_}){print FF "<$tag>$_->{$tag}</$tag>"}
 print FF "</item>";
}
print FF "\n</channel></rss>";
close FF;

$fh.="\n</ul><hr>Contributors:<br>";

for(sort keys %cont){
 $fh.=url("http://".unesc($_)).'<br>';
}

from_to($fh,'utf-8',$html_enc);
open FH,">$htmlname.tmp.html" or die "$! $htmlname.tmp.html";
print FH $fh;
close(FH);

print FH "\n</ul></body></html>\n";
close(FH);

if($ARGV[0] eq 'rotate'){
 for('xml','html'){
  my $x=eval("\$".$_."name");
  my $f0="$x.$_";
  my $f1="$x.".quotemeta($time).".$_";
  `mv -f $f0 $f1` if(-e $f0);
 }
}
`mv -f $htmlname.tmp.html $htmlname.html`;
