"""Convert saved HTML entry pages into CSV files.

Reads ``entries/qhr-<n>`` for n in 1..49999 and writes one row per entry to
``places.csv`` and one row per historical-name link to ``hist.csv``.

Targets Python 3: the CSV files are opened in text mode with ``newline=""``,
as the ``csv`` module requires there.
"""

import csv
import string
import unicodedata

# Column titles of places.csv. Apart from Number and Name, a row's values are
# appended in the order the page's table rows appear.
PLACES_HEADER = ["Number", "Name", "Type", "LGA", "Status", "Date", "Page",
                 "Plan", "Narrative", "AltName", "HistName",
                 "Latitude", "Longitude", "Datum"]
HIST_HEADER = ["Number", "Name", "HistNum", "HistName"]

# Table rows with these <th> texts (or with no simple <th> text) add nothing.
_SKIPPED_HEADERS = ("Gazettal", "Location", None)


def debug(s):
    """Print the code points of *s* as colon-separated hexadecimal."""
    print(":".join("{0:x}".format(ord(c)) for c in s))


def makeascii(s):
    """Return *s* reduced to ASCII, as an encoded byte string.

    The text is NFKD-normalised first (so, for example, a no-break space
    becomes a plain space and accented letters lose their accents); whatever
    is still outside ASCII is dropped by the ``ignore`` error handler.
    """
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore")


def islatlong(tag):
    """Return True if *tag* is a ``td`` in a row headed Latitude/Longitude."""
    if tag.name != "td":
        return False
    # Imported here so the string helpers in this module remain importable
    # on a machine without bs4.
    from bs4 import Tag
    return any(
        isinstance(sibling, Tag)
        and sibling.name == "th"
        and sibling.string in ("Latitude:", "Longitude:")
        for sibling in tag.parent.children
    )


def decimalDegrees(s):
    """Convert ``"<deg> <min> <sec> <N|S|E|W>"`` to signed decimal degrees.

    Returns 0.0 when *s* does not split into exactly four whitespace-separated
    fields. South and West are returned as negative values. Raises ValueError
    if one of the first three fields is not a number.
    """
    text = makeascii(s)
    # makeascii() returns bytes on Python 3; without decoding, the hemisphere
    # test below could never match and S/W values would stay positive.
    if isinstance(text, bytes):
        text = text.decode("ascii")
    parts = text.split()
    if len(parts) != 4:
        return 0.0
    degrees, minutes, seconds = (float(p) for p in parts[:3])
    dd = degrees + minutes / 60.0 + seconds / 3600.0
    if parts[3] in ("S", "W"):
        dd = 0 - dd
    return dd


def getrow(tr):
    """Return the text of the row's first ``td``, space-joined and stripped.

    Children whose ``.string`` is None (for example nested tags with several
    children) are skipped instead of raising TypeError. A row without a
    ``td`` yields the empty string.
    """
    cell = tr.td
    if cell is None:
        return ""
    pieces = []
    for child in cell.children:
        text = child.string
        if text is not None:
            pieces.append(text)
    return " ".join(pieces).strip()


def _build_row(number, name, body, histwriter):
    """Build the places.csv row for one entry; emit its hist.csv rows."""
    row = [number, name]
    for tr in body.find_all("tr"):
        header = tr.th.string if tr.th is not None else None
        if header in _SKIPPED_HEADERS:
            continue
        if header in ("Latitude:", "Longitude:"):
            row.append(decimalDegrees(getrow(tr)))
        elif header == "Alternative name":
            alt = tr.td.string if tr.td is not None else None
            row.append("" if alt is None else alt)
        elif header == "Historical name":
            links = tr.find_all("a")
            for link in links:
                href = link.get("href") or ""
                # Keep only what follows the last "=" in the link target
                # (the whole target if it has no "=").
                hist_num = href.rpartition("=")[2]
                histwriter.writerow([number, name, hist_num, link.string])
            row.append("Yes" if links else "No")
        else:
            row.append(getrow(tr))
    return row


def main():
    """Walk every entry file and write places.csv and hist.csv."""
    from bs4 import BeautifulSoup

    with open("places.csv", "w", newline="", encoding="utf-8") as csvfile, \
            open("hist.csv", "w", newline="", encoding="utf-8") as histfile:
        csvwriter = csv.writer(csvfile, delimiter=',',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
        histwriter = csv.writer(histfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(PLACES_HEADER)
        histwriter.writerow(HIST_HEADER)

        for i in range(1, 50000):
            print(i)
            path = "entries/qhr-" + str(i)
            try:
                # Read raw bytes and let BeautifulSoup detect the encoding;
                # the with-block closes the handle on every iteration.
                with open(path, "rb") as f:
                    html_doc = f.read()
            except FileNotFoundError:
                print("No file")
                continue

            # NOTE: no parser is named, so bs4 chooses among the installed
            # ones; pass one explicitly if results must not vary by machine.
            soup = BeautifulSoup(html_doc)
            body = soup.find(id="bodytext")
            if body is None or body.h2 is None or body.h2.string is None:
                print("No title")
                continue
            name = str(string.capwords(body.h2.string))
            if name == "":
                print("No name")
                continue

            csvwriter.writerow(_build_row(i, name, body, histwriter))


if __name__ == "__main__":
    main()