Use SGML parser
|
import sgmllib
import string
# Path of the HTML document whose title is extracted by the script below.
filename = "index.html"


class CleanExit(Exception):
    """Signal used to stop parsing early; carries no extra state."""
class Titlefinder(sgmllib.SGMLParser):
    """SGML parser that captures the text inside a <title> element.

    The collected text is stored in ``self.title``. ``CleanExit`` is raised
    from ``end_title`` so that the code feeding the parser can stop early.
    """

    def __init__(self, verbose=0):
        # Call the base initializer explicitly on the parent class.
        sgmllib.SGMLParser.__init__(self, verbose)
        # ``title`` is the finished title text (None until one is found).
        self.title = None
        # ``data`` is the list of text fragments being collected, or None
        # while no <title> start tag has been seen.
        self.data = None

    def start_title(self, attributes):
        """Start collecting character data for the title."""
        self.data = []

    def end_title(self):
        """Store the collected title text and abort parsing.

        Raises:
            CleanExit: always, once ``self.title`` has been set.
        """
        # str.join replaces the deprecated string.join() module function.
        self.title = "".join(self.data)
        raise CleanExit

    def handle_data(self, data):
        """Buffer character data, but only once collection has started."""
        if self.data is not None:
            self.data.append(data)
def get_title(filehandle):
    """Return the title text found in *filehandle*, or None if none is found.

    The stream is read with ``read(1024)`` and each chunk is fed to a
    Titlefinder. Reading stops as soon as the parser raises CleanExit.
    """
    finder = Titlefinder()
    try:
        chunk = filehandle.read(1024)
        while chunk:
            finder.feed(chunk)
            chunk = filehandle.read(1024)
        # End of input reached without CleanExit: flush the parser.
        finder.close()
    except CleanExit:
        return finder.title
    return None
# Extract the title from the configured file. The ``with`` block closes the
# file handle even if parsing raises (requires Python 2.6 or later).
with open(filename) as filehandle:
    title = get_title(filehandle)

# Parenthesised single-argument form prints the same text on Python 2 and
# is also valid Python 3 syntax; the one-element tuple keeps the format
# argument unambiguous.
print("The page's title is: %s" % (title,))
|
|
|
|
|