#! /local/bin/perl use strict; use CGI qw/:standard/; use HTTP::Request::Common; use LWP::UserAgent; use HTTP::Cookies; my $ua = LWP::UserAgent->new; # Some sites flip out if you don't have a user agent set. $ua->agent("Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)"); my $PROXY = ""; my $BASE = "http://www.redbrick.dcu.ie/~shimoda/proxy/prox.cgi"; my $BASE_URL = $BASE . "?url="; my $COOKIE_DIR = "/webtree/s/shimoda/proxy/cookies/"; my $FORM_POST_URL = ""; my $BASE_FETCH_URL; my $HTTP = "http://"; my $REPOST = "REPOST"; my $passedInURL = ""; my $relativePath = ""; my $REMOTE_USER = ""; my $CGI_QUERY = new CGI; my $GET = 1; my $POST = 0; my $cookie_jar = HTTP::Cookies->new(ignore_discard => 1); my $cookiefile; my $cookieExpireSecs = 1200; # 20 Minutes. open(ERROR_LOG, ">>/webtree/s/shimoda/logs/proxyLog2.txt"); # Make stdout hot. my $ofh = select STDOUT; $| = 1; select $ofh; sub logUserInfo { my $remoteHost = $CGI_QUERY->remote_host(); my $referrer = $CGI_QUERY->referer(); my $time = localtime(); open(LOG, ">>/webtree/s/shimoda/logs/proxyLog.txt"); print LOG "[$time] - User: \"$REMOTE_USER\", IP: \"$remoteHost\",URL:\"$passedInURL\",URL2:\"$BASE_FETCH_URL\", Referer:\"$referrer\"\n"; close(LOG); } # Method to try four different proxies to determine which is the best to use. sub findProxy { my @proxies; push(@proxies,'http://proxy1.dcu.ie:3128'); push(@proxies,'http://proxy2.dcu.ie:3128'); push(@proxies,'http://proxy3.dcu.ie:3128'); my $ua = LWP::UserAgent->new; foreach my $p (@proxies) { $ua->proxy(['http', 'ftp'], $p); my $URL = "http://www.redbrick.dcu.ie"; my $res = $ua->request(GET $URL); if($res->is_success) { $PROXY = $p; last; } } } sub makeRequest { my ($url, $getOrPost, $content) = (@_); my $req = HTTP::Request->new(); if($getOrPost == $GET) { $req->method("get"); } else { $req->method("post"); $req->content($content); $req->push_header("Content-Length" => length($content)); } $req->uri($url); my $referer = $url; if($CGI_QUERY->referer()) { $referer = $CGI_QUERY->referer(); } $req->push_header("Referer" => $referer); $cookie_jar->add_cookie_header($req); return $ua->request($req); } sub getURLContent { my ($url) = (@_); if($PROXY !~ /^$/) { $ua->proxy(['http', 'ftp'], $PROXY); } my $res; my $content=""; my $typeOfRequest = $GET; my $responseCode = 0; my $fullResponse; my $newCookie = 0; # If we have a form post url, then it's a redirected post # on the original web page. # Parse the base URL and the content from the query string, # then post the information to the server and parse the results. # Sometimes the "post" method isn't implemented, in the case of google # specificially. if($FORM_POST_URL)# && $FORM_POST_URL !~ /google/) { # The base url is anything before the first slash, # the content is anything after the "?". my ($basicUrl) = $FORM_POST_URL =~ m/(.*?)\?/; ($content) = $FORM_POST_URL =~ m/.*?\?(.*)/; print ERROR_LOG "Posting to \"$basicUrl\", content: \"$content\"\n"; # Populate the request with the necessary info. $res = makeRequest($basicUrl, $POST, $content); $typeOfRequest = $POST; } # Otherwise it's a standard "get". else { $res = makeRequest($url, $GET, $content); $typeOfRequest = $GET; } $responseCode = $res->code; $newCookie = $responseCode != 200; $cookie_jar->extract_cookies($res); # Process a redirect. if($responseCode =~ m/30\d/) { my $redirect = $res->header("Location"); if($redirect =~ m#^\?#) { $redirect = $url . $redirect; } print ERROR_LOG "Code $& returned: \"$redirect\", original: \"$url\"\n"; $res = makeRequest($redirect, $GET, $content); $typeOfRequest = $GET; } # To be implemented. # Response indicates that authentication is required. # elsif($responseCode == 401) # { # $ua->credentials( # 'www.redbrick.dcu.ie:80', # $realm, # $username,$password); # $res = makeRequest($url, $GET, $content); # } elsif($responseCode == 405) { # We did a "post" and they want a "get" or vice versa. # So send the URL again as the opposite request method. # print ERROR_LOG "Code 405 returned"; my $get = $res->header("Allow") =~ m/GET/i; $res = makeRequest($url, $get, $content); } elsif($responseCode == 501) { $res = makeRequest($url, !$typeOfRequest, $content); $typeOfRequest = !$typeOfRequest; } elsif($responseCode == 204) { $fullResponse = $res->as_string; $fullResponse =~ s/\n//g; return printError($responseCode, $fullResponse); } if($newCookie) { $cookie_jar->extract_cookies($res); } # Spit out the URL content if it succeeds, and a nice error if it doesn't. if($res->is_success) { print ERROR_LOG "Reponse code \"$responseCode\" for user: \"". $CGI_QUERY->user_name() . "\", url: \"$url\"\n"; $cookie_jar->save($cookiefile); return $res->content(); } else { $fullResponse = $res->as_string; $fullResponse =~ s/\n//g; print ERROR_LOG "Getting:\"$url\" - Response Code: \"$responseCode\"\n"; print ERROR_LOG "Error:\"" . $res->as_string . "\"\n"; } my $error = printError($responseCode, $fullResponse); return $error; } sub printError { my ($responseCode, $fullResponse) = (@_); my $c; $c .=< Oops...

The server didn't return any content for this URL. Probably just a bug in the script, glitch in the matrix, or your tinfoil hat is letting in too many mind controlling rays from the CIA.

The error code returned was: $responseCode


Full response:
$fullResponse

END return $c; } sub printHeaders { print header; } # Replace all links in the document to links pointing back to the # script. Convert all relative links, images, CSS, and JS to absolute # references. sub replaceLinks { my ($content) = (@_); my $backupContent = $content; my $relativeCSSURL = ""; if($relativePath) { $relativeCSSURL = $relativePath; } # Some sites use a BASE HREF="" to define relative links # So we need to strip this out and use it as our relative base, # For when we're compiling absolute URLS. $content =~m# <\s*base\s*href=\s*\"(.*?)/?\" #gixs; if($1) { $relativeCSSURL = $1; $relativeCSSURL =~ s#\Q$BASE_FETCH_URL\E##; } # If we've successfully found a new correct relative path, use it. if($relativeCSSURL !~ m/^\/?$/) { $relativePath = $relativeCSSURL; } # Four seperate regexes to parse the document in one pass, # in order of frequency: 1. hyperlinks, 2. images, 3. css, 4. meta refreshes. # Captures links of the form ']+) ) /ixo; # Meta refresh tags look something like # my $metaRefreshRegex = qr/ content\s*=\s*["']\d+;\s* url\s*=\s*(.*?)["'] /ixo; # Captures CSS links my $cssRegex = qr/ (?: # CSS along the lines of (?:\@import) | # '@import "blah.css"' (?:href\s*=) # or '' # # This allows us to determine that there is more content # in the query string and lets us repost the form to the correct # place. # This used to look like ["'](.*?)["'] # But some sites don't put parenthesis around their links. # <\s*form(.*?)action\s*=\s*["']?([\w\d\+-/;=:\?\&]+)["'\s]?(.*?)> $backupContent =~ s# <\s*form(.*?)action\s*=\s*["']?([^"'\s]+)(.*?)> # "\n". "\n" . "\n" #gixse; # Change any "post" action requests into "get" requests. # Should work for ~most actions. # The script takes the request in as a "get" and then correctly # Sends off a "post" request to the remote server. $backupContent =~ s#method\s*=\s*["'].*?["']#method="get"#ig; return $backupContent; } sub convertRelativeLinkToAbsolute { my ($link) = (@_); $link =~ s#(\.\./)+#/#g; $link =~ s/["']//g; $link =~ s#^\./(.*)#$1#; # Tests if the link is absolute (Starts with http://something. if($link !~ m/$HTTP.*/gi) { # Link starts with "//". Some older crap sites have this. # *cough* slashdot *cough* if($link =~ m#^//#) { return "http:" . $link; } # Link starts with a slash like href="/something" # or href='/something' if($link !~ m#^/.*#) { # Make any links which link to relative paths link to absolute references. if($relativePath) { $link = $BASE_FETCH_URL . $relativePath . "/" . $link; } else { $link = $BASE_FETCH_URL . "/" . $link; } } else { $link = $BASE_FETCH_URL . $link; } } return $link; } sub processCookie { my ($baseDomain) = $BASE_FETCH_URL =~ m#\Q$HTTP\E.*?\.(.*)#; $cookiefile = $COOKIE_DIR . $REMOTE_USER . "-" . $baseDomain; my @fileStatus = stat($cookiefile); # Find out if the cookie has expired. # Ignore it if it has, otherwise load the cookie and touch the file. if((@fileStatus) && ($fileStatus[9] + $cookieExpireSecs > time())) { $cookie_jar->load($cookiefile); utime (time, time, $cookiefile); } } sub processForm { if (defined $ENV{"QUERY_STRING"}) { $CGI_QUERY = new CGI($ENV{"QUERY_STRING"}); $REMOTE_USER = $CGI_QUERY->user_name(); my %params = $CGI_QUERY->Vars; my $actionRepost = %params->{$REPOST}; if($actionRepost) { $FORM_POST_URL = decodeUrl($ENV{"QUERY_STRING"}); $FORM_POST_URL =~ s/url=(.*?)\&/$1?/; $passedInURL =~ s/REPOST=1//; } $passedInURL = %params->{"url"}; parseCorrectURLInfo($passedInURL); processCookie(); } } sub parseCorrectURLInfo { my ($url) = (@_); if($url) { $passedInURL = decodeUrl($url); if($passedInURL !~ /^$HTTP/) { $passedInURL = $HTTP . $passedInURL; } $BASE_FETCH_URL = $passedInURL; $passedInURL =~ m#($HTTP.*?)/#i; if($1) { $BASE_FETCH_URL = $1; } $passedInURL =~ m#$HTTP.*?(/.*)/#; if($1 && $passedInURL !~ m/^$1$/i) { $relativePath = $1; } if($relativePath =~ m#^$HTTP#i) { $relativePath = ""; } return; } } sub decodeUrl { my ($urlToDecode) = (@_); $urlToDecode =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg; return $urlToDecode; } sub encodeUrl { my ($urlToCode) = (@_); $urlToCode =~ s/([^\w()'*~!.-])/sprintf '%%%02x', ord $1/eg; return $urlToCode; } sub addSuperDuperSexyForm { my ($content) = (@_); $content =~ s# # #xi; $content =~ s# <\s*body(.*?)> # #xis; return $content; } ### Begin the script # Parse the passed in URL. processForm(); # Determine which Proxy to use. findProxy(); # Grab the URLs content, if they have # entered a URL, grab that one, otherwise # just grab the base one. my $content = ""; if($FORM_POST_URL) { $content = getURLContent($FORM_POST_URL); } elsif($passedInURL) { $content = getURLContent($passedInURL); } elsif($BASE_FETCH_URL) { $content = getURLContent($BASE_FETCH_URL); } else { $content .=< Boing

This proxy is designed to fetch pages internal to DCU for associates of redbrick, or for redbrick members who are offsite.

It will also allow access to content to which DCU has subscribed; Redbrick membership provides you with access to this content.

For example:

Kindly direct any suggestions / bug reports / COLD HARD CASH to shimoda at redbrick.

END } # Replace all links to point back to our script # Change any relative links to absolute for # images, CSS, JS, etc $content = replaceLinks($content); # Add in a handy form to allow them to select a site. $content = addSuperDuperSexyForm($content); # Keep a track of who gets any use from this thing. logUserInfo(); # Output any relevant headers printHeaders(); # Close the error log close(ERROR_LOG); close(ERROR_LOG); # Display our modified content print $content; 1;