TkkrLab:Proxychecker
From
For my project I'm scraping all kinds of websites, and the idea is to use proxies when connecting to a website multiple times. So what I'm doing is this: I connect to the hidemyass proxy list over a Tor connection, and after scraping that page I check whether each proxy is usable by connecting to watismijnip.nl through it. If the proxy can be used, it is printed; otherwise nothing happens.
"""Scrape a proxy list and print every proxy that can load a test page.

The proxy list page is fetched through a local HTTP proxy, each listed
``IP:port`` is then used to request http://www.watismijnip.nl, and the
proxies for which that page comes back are printed one per line.

Written for Python 2 (``urllib2`` / ``httplib``).
"""
import httplib
import socket
import time
import urllib2

import lxml.html as html

USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; en) Opera 9.50"

# Local HTTP proxy used for the Tor connection (8118 is presumably Privoxy
# forwarding to Tor -- confirm against the local setup).
TOR_PROXY = "127.0.0.1:8118"

# Seconds to wait on a connection before giving up, so that one dead proxy
# cannot block the whole run.
DEFAULT_TIMEOUT = 30


class browser(object):
    """Fetch pages over HTTP through a proxy."""

    def _fetch(self, url, proxy, timeout):
        """Return the body of ``url`` requested through ``proxy``.

        ``proxy`` is a ``"host:port"`` string. Network errors propagate to
        the caller.
        """
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
        opener = urllib2.build_opener(proxy_handler, proxy_auth_handler)
        opener.addheaders = [('User-agent', USER_AGENT)]
        response = opener.open(url, timeout=timeout)
        try:
            return response.read()
        finally:
            response.close()

    def tor(self, url, timeout=DEFAULT_TIMEOUT):
        """Return the body of ``url`` fetched through ``TOR_PROXY``.

        Raises whatever ``urllib2`` raises when the request fails.
        """
        return self._fetch(url, TOR_PROXY, timeout)

    def proxy(self, url, proxy, timeout=DEFAULT_TIMEOUT):
        """Return the body of ``url`` fetched through ``proxy``.

        On a network or HTTP failure the exception object is returned
        instead of being raised, so callers must check the type of the
        result before treating it as page content.
        """
        try:
            return self._fetch(url, proxy, timeout)
        # urllib2.HTTPError is a subclass of URLError; socket.error also
        # covers socket.timeout raised while reading the response.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as e:
            return e


class scrape(object):
    """Extract data from downloaded HTML."""

    def proxylist(self, url):
        """Return a list of ``(ip, port)`` string tuples found in a page.

        Despite its name, ``url`` is the HTML source of the proxy list page,
        not its address. The first table row is skipped, and rows that do not
        yield at least two text fields are ignored.
        """
        doc = html.fromstring(url)
        rows = [
            tr.xpath('td[position()=2]//span/text() | td[position()=3]/text()')
            for tr in doc.xpath('//tr')
        ]
        proxies = []
        for fields in rows[1:]:
            if len(fields) < 2:
                continue
            # The first character of the port cell is dropped -- presumably
            # leading whitespace in the markup; confirm against the page.
            proxies.append((fields[0], fields[1][1:]))
        return proxies


scrapelist = scrape()
website_tor = browser()
proxylist = scrapelist.proxylist(website_tor.tor("http://hidemyass.com/proxy-list/"))
for IP, port in proxylist:
    proxy = IP + ":" + port
    website = website_tor.proxy("http://www.watismijnip.nl", proxy)
    # proxy() hands back an exception object on failure; only search real
    # page bodies for the expected title.
    if isinstance(website, str) and "<title>www.WatIsMijnIP.nl ->" in website:
        print(proxy)
    # Pause between consecutive checks.
    time.sleep(2)
As a result, 49 working proxies:
174.139.204.221:80 200.241.33.164:80 187.102.64.137:8080 193.131.184.102:3128 193.230.156.89:3128 190.85.86.146:8080 180.96.19.196:3128 91.98.117.29:8080 204.188.215.52:3128 82.207.109.251:3128 222.171.176.109:8080 190.90.158.195:8080 212.55.225.162:8080 177.21.119.9:3128 187.54.67.169:3128 187.18.240.87:3128 82.131.174.21:8080 82.114.82.60:8080 77.123.88.13:3128 200.163.51.83:8080 200.241.69.44:3128 119.115.136.62:8080 190.196.19.157:3128 84.22.3.32:8080 200.223.17.203:8080 190.12.6.212:3128 196.1.120.75:81 219.159.105.180:8080 221.195.42.195:8080 178.48.2.237:8080 66.178.57.4:80 190.249.188.220:8080 190.183.236.134:8080 218.25.249.186:80 173.59.48.13:3128 178.159.250.7:80 201.150.2.170:8080 200.255.18.61:3128 113.230.76.234:443 114.33.112.160:8181 41.189.36.26:3128 113.108.219.46:8080 202.51.116.177:8888 57.90.36.24:80 24.100.137.18:8080 186.195.172.68:80 61.157.217.31:80 157.100.157.154:3128 212.118.23.98:8080
The code is a bit ugly but it works.