import requests
from bs4 import BeautifulSoup
import pandas

r = requests.get("http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/")
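# A quick sanity check could go here: r.status_code should be 200, and
# r.raise_for_status() would raise an exception for a 4xx/5xx response...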

# store the HTML content of the response object...
c = r.content
# print(c)

soup = BeautifulSoup(c, "html.parser")
# print(soup.prettify())
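# "html.parser" is Python's built-in parser; a third-party parser such as lxml
# could be passed instead if it is installed...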

# find_all() generates a list of all matching <div class="propertyRow"> elements...
all_rows = soup.find_all("div", {"class": "propertyRow"})

# example: price of the first listing, with newlines stripped out...
print(all_rows[0].find("h4", {"class": "propPrice"}).text.replace("\n", ""))

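# The pagination links at the bottom of the page carry the class "Page";
# the text of the last one is the total number of result pages...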
page_nr = soup.find_all("a", {"class": "Page"})[-1].text  # .text returns a string...
print(page_nr)  # page_nr is a string... you must convert it to int before using it in range()...
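
# Each results page shows 10 listings and is addressed through the s= query
# parameter: if page_nr were "3", range(0, 30, 10) would yield 0, 10 and 20, and the
# requested URLs would end in t=0&s=0.html, t=0&s=10.html and t=0&s=20.html...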
l = []  # one dictionary per property will be appended here...
base_url = "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
for page in range(0, int(page_nr) * 10, 10):  # start at 0, step by 10 up to the last page...
    print(base_url + str(page) + ".html")
    r = requests.get(base_url + str(page) + ".html")
    c = r.content
    soup = BeautifulSoup(c, "html.parser")  # BeautifulSoup returns a parsed document object...
    all_rows = soup.find_all("div", {"class": "propertyRow"})  # find_all() returns a list...
    for item in all_rows:
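        # "item" is a single propertyRow <div>; the individual fields are pulled out of it below...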
        # Dictionary of key/value pairs to store the scraped property info...
        d = {}
        # find() grabs the first occurrence of the element being searched for, while
        # find_all() returns a list of every element matching the criteria. The
        # propAddressCollapse spans hold two pieces of data: index 0 and index 1...
        d["Address"] = item.find_all("span", {"class": "propAddressCollapse"})[0].text
        try:
            d["Locality"] = item.find_all("span", {"class": "propAddressCollapse"})[1].text
        except IndexError:  # some listings have no locality span...
            d["Locality"] = None

        d["Price"] = item.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", "")
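        # If the price text carries a currency symbol or thousands separators, turning
        # it into a number would need those stripped first, e.g.
        # float(d["Price"].replace("$", "").replace(",", ""))...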
        # find() returns None when a tag is missing, so the chained .find("b")
        # raises AttributeError for listings without that field...
        try:
            d["Beds"] = item.find("span", {"class": "infoBed"}).find("b").text
        except AttributeError:
            d["Beds"] = None

        try:
            d["Area"] = item.find("span", {"class": "infoSqFt"}).find("b").text
        except AttributeError:
            # pass here would make Python ignore the exception, but the column would then be missing...
            d["Area"] = None

        try:
            d["Full Baths"] = item.find("span", {"class": "infoValueFullBath"}).find("b").text
        except AttributeError:
            d["Full Baths"] = None

        try:
            d["Half Baths"] = item.find("span", {"class": "infoValueHalfBath"}).find("b").text
        except AttributeError:
            d["Half Baths"] = None
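        # The same find/except pattern repeats above; a small helper that takes a class
        # name and returns the <b> text or None could remove the repetition...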

        # Pair each featureGroup label with its featureName value and pick out the lot size...
        for column_group in item.find_all("div", {"class": "columnGroup"}):
            # print(column_group)  # debug output...
            for feature_group, feature_name in zip(column_group.find_all("span", {"class": "featureGroup"}),
                                                   column_group.find_all("span", {"class": "featureName"})):
                # print(feature_group.text, feature_name.text)  # debug output...
                if "Lot Size" in feature_group.text:
                    d["Lot Size"] = feature_name.text

        # append the dict for this property to the list...
        l.append(d)

# pandas was already imported at the top; build a DataFrame from the list of dicts...
df = pandas.DataFrame(l)
print(df)
df.to_csv("martyOutput.csv")
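# to_csv() writes the DataFrame index as the first column by default; pass index=False
# to omit it. The CSV can be read back later with pandas.read_csv("martyOutput.csv")...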