#!/usr/bin/python
#
# Link checker 1.0
# A.P.Selby 2005ish
# 
# Links can arise from attributes (see http://www.w3.org/TR/REC-html40/index/attributes.html):
#  action (form)
#  archive (applet, object) [In list form]
#  background (body)
#  cite (blockquote, q, del, ins)
#  classid (object)
#  data (object)
#  href (a, area, link)
#  longdesc (img, frame, iframe)
#  profile (head)
#  src (script, input, frame, iframe, img)
#  usemap (img, input, object)
# and apart from base->href, being an attribute above implies that it is a link
#
# Base-changing commands can arise from attributes:
# codebase (object, applet)
# href (base)
#
# Can we have funny stuff like fred.gif in http://www.fred/fred.blah being a
# directory, and so implying http://www.fred/fred.blah/index.html, or whatever?
# Yes it appears so, but let the server decide it.
#
# At the moment it will follow crazy things like
# <p href="fred">, but on the other hand, it might have a better chance of picking
# up a new (element,attribute) pair if it doesn't screen the element.
#
# If you use urllib.urlopen(), it will load whatever the server deems to be the error page
# if there is a 404, making it hard to distinguish errors and non-errors.
# If you use urllib.URLopener({}).open(), it will cause an IOError exception if there is a 404, but
# it is too sensitive in that it will also cause an exception if, e.g., there is a 301 (Moved Permanently)
# Looks like the thing to do is use FancyURLopener and override http_error_default. This doesn't
# give exceptions by default, but gives a hook to trap errors.
#
# The top node is a special case in that it is by definition "internal" even if geturl()
# returns something which might suggest otherwise
#
# If there is a way to send an http request which just asks if a page exists, then this
# could speed things up a lot for pages which you know you aren't going to parse (e.g.,
# external pages, and perhaps *.jpg etc).
#
# Another improvement would be to do things in parallel, so that you are never waiting for
# a website to return a result. (Perhaps somehow reduce the timeout as a stopgap measure.)
#
# Simple improvement: have a max depth
# 
# Medium improvement: have the partial results available as they come in
#
# Difficult improvement: follow javascript links
#
# Some sites require a username and password, and urllib's FancyURLopener then stops to prompt for keyboard input
#
# To reload results, use:
# import pickle
# f=file('results','r');results=pickle.load(f);f.close()
#
# Perhaps should respect robots.txt (with an option not to)
# Actually, perhaps not
#
# Fix the google-anti-scrape thing. (Just to special case, and not actually run a google search)
#
# Would be useful to have the ability to override the assumption that 'internal' is determined
# by having the same prefix as the original page. (Use regexp)
#
# Would be nice if the parser were more robust when encountering parse errors and could
# carry on (assuming a browser would)
# e.g.,
#<a href="tours.html"img border="0" src="../../images/tourssm.gif" width="25" height="17"></a></td>
#<td valign="middle">
# will cause a parse error, whereas browsers will continue, so although the page will
# be flagged by the link checker as invalid, it will miss the tree underneath the rest of
# the page.

import HTMLParser,string,urllib,re

class fuo(urllib.FancyURLopener):
  """FancyURLopener variant that turns unhandled HTTP errors into IOError.

  The stock FancyURLopener hands back the server's error page as if it were
  a normal response, which would make a broken link (e.g. a 404) look valid.
  Status codes that FancyURLopener handles itself (such as redirects) do not
  go through this hook.
  """
  def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Close the response and raise IOError describing the HTTP failure.

    The exception arguments mirror those used by urllib.URLopener, so the
    status code and message are available to whoever catches it.
    """
    # Release the connection before bailing out; nobody will read the body.
    fp.close()
    raise IOError('http error', errcode, errmsg, headers)

def basify(s):
  """Return the "directory" part of URL s, always ending in '/'.

  Everything after the last '/' is dropped, where only slashes located
  after the '//' of the scheme are considered.  If there is no such slash
  (e.g. 'http://host') a '/' is appended instead.

  e.g., http://host/a/b.html --> http://host/a/
        http://host          --> http://host/
  """
  # Start the search past the '//' so the scheme's own slashes are never
  # mistaken for a path separator.  (With no '//' present this is index 1.)
  start=s.find('//')+2
  cut=s.rfind('/',start)
  if cut==-1: return s+'/'
  return s[:cut+1]

# e.g., x/a/b/../../c/./../.d/e/f/././../.././g/. --> x/.d/g
def normalise(s):
  """Return URL s without its #fragment and with '.' and '..' segments resolved.

  A '/.' segment is dropped; a '/..' segment is dropped together with the
  segment before it.  Names that merely begin with a dot ('/.d', '/...')
  are kept.  A trailing '/.' or '/..' leaves no trailing slash behind.

  NOTE: the resolution is purely textual - it does not know where the host
  part ends, so surplus '..' segments will eat into the scheme/host.
  """
  frag=s.find('#')
  if frag!=-1: s=s[:frag]
  end=len(s)
  out=''; pos=0
  while True:
    dot=s.find('/.',pos)
    if dot==-1:
      # No more candidate segments: copy the remainder verbatim.
      out+=s[pos:]
      break
    out+=s[pos:dot]
    after=dot+2 # index of the character following '/.'
    if after==end or s[after]=='/':
      # A lone '.' segment: skip it.
      pos=after
      continue
    if s[after]=='.' and (after+1==end or s[after+1]=='/'):
      # A '..' segment: also remove the last segment emitted so far.
      slash=out.rfind('/')
      if slash==-1:
        out=''
      else:
        out=out[:slash]
      pos=after+1
      continue
    # An ordinary name that happens to start with a dot: keep the '/.'.
    out+=s[dot:after]
    pos=after
  return out

def html(s):
  """Return True if s, ignoring trailing whitespace, ends in a web-page suffix.

  The recognised suffixes are .html, .htm, .shtml, .php and .php3; the
  comparison is case-sensitive.
  """
  s=s.rstrip()
  for suffix in ('.html','.htm','.shtml','.php','.php3'):
    if s.endswith(suffix): return True
  return False

class lookat(HTMLParser.HTMLParser):
  """HTML parser that records every link found in the pages fed to it.

  Attributes:
    follow  - maps each link-bearing attribute name to the index in
              self.base of the base URL it is resolved against.
    base    - [document base, codebase]; set by the caller per page and
              updated by <base href> and codebase attributes.
    url     - URL of the page currently being parsed (set by the caller).
    level   - depth of the current page (set by the caller).
    n       - number of distinct new URLs discovered so far.
    queue   - URLs still waiting to be visited.
    results - url -> [status, internal flag, level,
                      dict of urls linking to it, dict of urls it links to].

  handle_starttag also reads the module-level globals topbase (URL prefix
  that marks a link as internal) and maxd (maximum depth to queue), which
  the script's main section assigns.
  """
  def __init__(self,url):
    self.follow={'action':0, 'background':0, 'cite':0, 'classid':1, 'codebase':1,
                 'data':1, 'href':0, 'longdesc':0, 'profile':0, 'src':0, 'usemap':0}
    # We don't actually follow 'codebase' - this will be a hand-coded exception.
    self.base=['','']
    self.url=''
    self.n=0
    self.level=0
    self.queue=[url]
    # The starting URL is internal by definition and sits at level 0.
    self.results={url:["OK",True,0,{},{}]}
    HTMLParser.HTMLParser.__init__(self)

# Could use urllib.basejoin(), but this says basejoin('http://fred/a','b') = http://fred/b, which is funny
  def combine(self,a):
    """Resolve attribute a=(name, value) to a normalised absolute URL.

    Values carrying their own scheme are used as they are; an empty value or
    a bare '#fragment' refers to the current page; everything else is joined
    to the base selected by self.follow[name].
    """
    if a[1]:
      # Whitespace around the value is not part of the URL.
      c=a[1].strip()
    else:
      c=''
    if re.match('[A-Za-z]+:',c):
      return normalise(c)
    if c=='' or c[0]=='#':
      return normalise(self.url) # href="" and "#fred" refer to the current page, not the base
    b=self.base[self.follow[a[0]]]
    if c[0]=='/':
      # Host-relative link: keep only scheme://host of the base.
      x=b.find('//')
      if x==-1: return normalise(b+c) # shouldn't happen
      x=b.find('/',x+2)
      if x==-1: return normalise(b+c)
      return normalise(b[:x]+c)
    # Plain relative link: join with exactly one '/' between base and value.
    return normalise(b.rstrip('/')+'/'+c)

  def handle_starttag(self, tag, attrs):
    """Record the links carried by one start tag.

    <base href=...> and codebase attributes only update self.base.  Every
    other attribute listed in self.follow is resolved, entered in
    self.results on first sight (and queued while the depth limit allows),
    and cross-linked with the current page.
    """
    if tag=='base':
      for a in attrs:
        if a[0]=='href':
          self.base[0]=basify(self.combine(a))
      return
    # Use our own table rather than relying on a module global of the same name.
    results=self.results
    for a in attrs:
      if a[0]=='codebase':
        self.base[1]=basify(self.combine(a))
      elif a[0] in self.follow:
        c=self.combine(a)
        if c not in results:
          results[c]=["OK",c.startswith(topbase),self.level+1,{},{}]
          if self.level<maxd: self.queue.append(c)
          self.n+=1
        results[self.url][4][c]=0
        results[c][3][self.url]=0

  def handle_endtag(self, tag):
    """End tags carry no links; ignore them."""
    return

  def handle_data(self,data):
    """Text content carries no links; ignore it."""
    return

def sd(d):
  """Return the keys of dictionary d as a sorted list."""
  # list(d) yields the keys whether or not keys() returns a real list.
  keys=list(d)
  keys.sort()
  return keys
def descerr(x):
  """Describe the failure held in status x; a status of "OK" gives ''.

  x is "OK", "OpenError", or a (line, byte) pair locating a parse error.
  """
  if x=="OpenError":
    return x
  if x!="OK":
    return "Parse error at line %d, byte %d" % x
  return ""
def descv(x):
  """Describe status x for display: "OK" as itself, anything else via descerr."""
  if x!="OK":
    return descerr(x)
  return x
def descserr(x):
  """Return descerr(x) prefixed by a space, or '' when there is no error text."""
  text=descerr(x)
  if text=="":
    return text
  return " "+text
  
if __name__ == '__main__':
  import os,sys,urllib,pickle
  if len(sys.argv)<=1:
    print 'usage: link.py <url> [,<output directory> [,<maxdepth>] ]'
    sys.exit(0)
  url=sys.argv[1]
  if len(sys.argv)>=3: outdir=sys.argv[2]
  else: outdir='report'
  if len(sys.argv)>=4: maxd=int(sys.argv[3])
  else: maxd=1000000
  
  # Format of results:
  # A valid url is one which is not broken, i.e., one which was actually visited
  # Dictionary indexed by urls, valid or not
  # Entries are tuples ("Valid" or "OpenError" or (l,c) tuple which means can't parse at line l, column c
  #                     flag to say whether internal,
  #                     level,
  #                     dictionary of (valid) urls which link to this url,
  #                     dictionary of urls (valid or not) which this url links to)

  opener=fuo({}) # Any proxy servers would go in this dictionary
  topbase=''
  num=0
  lpar=lookat(url)
  results=lpar.results
  while len(lpar.queue)>0:
    x=lpar.queue.pop(0)
    num+=1
    print "%4d %4d %2d  Visiting %s" % (num,len(lpar.queue)+1,results[x][2],x)
    if string.lower(x[0:5])=='http:' or string.lower(x[0:5])=='file:':
      try:
        y=opener.open(x)
      except IOError:
        results[x][0]="OpenError"
      #print x,": GH", results[x][0], results[x][1], y.info().getheader('content-type')
      if results[x][0]=="OK" and results[x][1] and y.info().getheader('content-type')[0:9]=='text/html':
        lpar.url=x
        lpar.level=results[x][2]
        lpar.base[0]=lpar.base[1]=basify(y.geturl())
        if topbase=='': topbase=lpar.base[0]
        try:
          lpar.reset()
          lpar.feed(y.read())
          lpar.close()
        except HTMLParser.HTMLParseError:
          results[x][0]=lpar.getpos()
    
  if results[url][0]!="OK": print "Error with %s: %s\n" % (url,descv(results[url][0]))

  try:
    os.stat(outdir)
  except OSError:
    os.mkdir(outdir)
  f=file(outdir+'/results','w');pickle.dump(results,f);f.close()
  f=file(outdir+'/graph','w')
  urls=sd(results)
  for x in urls:
    print >>f, "="*(len(x)+4)
    print >>f, "|",x,"|"
    print >>f, "="*(len(x)+4)
    print >>f, "Status:",descv(results[x][0])
    print >>f, "Internal:",results[x][1]
    print >>f, "Level:",results[x][2]
    z="None"
    for y in results[x][3]:
      if results[y][2]==results[x][2]-1:
        z=y
    print >>f, "Parent:",z
    for y in sd(results[x][3]): print >>f, "<",y
    for y in sd(results[x][4]):
      if results.has_key(y) and results[y][0]=="OK": z=' '
      else: z='*'
      print >>f, ">"+z,y
    print >>f
  f.close()
  f=file(outdir+'/brokengraph','w')
  for x in urls:
    if results[x][0]!="OK":
      print >>f, "="*(len(x)+4)
      print >>f, "|",x,"|"
      print >>f, "="*(len(x)+4)
      print >>f, "Status:",descv(results[x][0])
      print >>f, "Internal:",results[x][1]
      print >>f, "Level:",results[x][2]
      z="None"
      for y in results[x][3]:
        if results[y][2]==results[x][2]-1:
          z=y
      print >>f, "Parent:",z
      for y in sd(results[x][3]): print >>f, "<",y
      print >>f
  f.close()
  f=file(outdir+'/summary','w')
  for x in urls:
    if results[x][0]=="OK": z='V'
    else: z='.'
    f.write(z)
    if results[x][1]: z='I'
    else: z='.'
    f.write(z+' ')
    print >>f,x
  f.close()
  f=[file(outdir+'/brokenexternal','w'),file(outdir+'/validexternal','w'),
     file(outdir+'/brokeninternal','w'),file(outdir+'/validinternal','w')]
  for x in urls: print >>f[int(results[x][0]=="OK")+2*int(results[x][1])],"%s%s" % (x,descserr(results[x][0]))
  for x in f: x.close()
  print "Output written to directory '%s'" % outdir
  sys.exit(0)
