This wiki has been archived and made read-only.
For up-to-date information about TkkrLab and its projects please visit our main website at tkkrlab.nl.

TkkrLab:Proxychecker


For my project I'm scraping all kinds of websites, and the idea is to use proxies when connecting to the same website multiple times. So what I do is: I fetch the hidemyass proxy list over a Tor connection, and after scraping that page I check whether each proxy is usable by connecting through it to watismijnip.nl. If the proxy can be used it is reported; otherwise it is skipped.

import urllib2
import httplib
import lxml.html as html
import time

class browser():
    def tor(self, url):
        # Fetch a page through the local Tor HTTP proxy on 127.0.0.1:8118 (e.g. Privoxy in front of Tor).
        proxy_handler = urllib2.ProxyHandler({'http': "127.0.0.1:8118"})
        proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
        request = urllib2.build_opener(proxy_handler, proxy_auth_handler)
        request.addheaders = [('User-agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; en) Opera 9.50")]
        return request.open(url).read()

    def proxy(self, url, proxy):
        # Fetch a page through the given 'host:port' HTTP proxy.
        try:
            proxy_handler = urllib2.ProxyHandler({'http': proxy})
            proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
            request = urllib2.build_opener(proxy_handler, proxy_auth_handler)
            request.addheaders = [('User-agent', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; en) Opera 9.50")]
            return request.open(url).read()
        except (urllib2.URLError, httplib.HTTPException, urllib2.HTTPError), e:
            # Treat unreachable proxies as an empty page so the title check below simply fails.
            return ""

class scrape:

    def proxylist(self, page):
        # Parse the hidemyass proxy-list HTML: column 2 holds the IP, column 3 the port.
        doc = html.fromstring(page)
        content = [tr.xpath('td[position()=2]//span/text() | td[position()=3]/text()')
                   for tr in doc.xpath('//tr')]

        proxylist = []
        # Skip the header row (index 0) and collect (IP, PORT) tuples.
        for row in xrange(1, len(content)):
            IP = content[row][0]
            PORT = content[row][1][1:]  # drop the stray leading character before the port
            proxylist.append((IP, PORT))

        return proxylist


# Fetch the proxy list over Tor, then test every proxy against watismijnip.nl.
scrapelist = scrape()
website_tor = browser()
proxylist = scrapelist.proxylist(website_tor.tor("http://hidemyass.com/proxy-list/"))

for IP, port in proxylist:
    proxy = IP + ":" + port
    website = website_tor.proxy("http://www.watismijnip.nl", proxy)
    # A working proxy serves the watismijnip.nl page, recognisable by its title.
    if "<title>www.WatIsMijnIP.nl ->" in website:
        print proxy
        time.sleep(2)

The result: 49 working proxies.

174.139.204.221:80
200.241.33.164:80
187.102.64.137:8080
193.131.184.102:3128
193.230.156.89:3128
190.85.86.146:8080
180.96.19.196:3128
91.98.117.29:8080
204.188.215.52:3128
82.207.109.251:3128
222.171.176.109:8080
190.90.158.195:8080
212.55.225.162:8080
177.21.119.9:3128
187.54.67.169:3128
187.18.240.87:3128
82.131.174.21:8080
82.114.82.60:8080
77.123.88.13:3128
200.163.51.83:8080
200.241.69.44:3128
119.115.136.62:8080
190.196.19.157:3128
84.22.3.32:8080
200.223.17.203:8080
190.12.6.212:3128
196.1.120.75:81
219.159.105.180:8080
221.195.42.195:8080
178.48.2.237:8080
66.178.57.4:80
190.249.188.220:8080
190.183.236.134:8080
218.25.249.186:80
173.59.48.13:3128
178.159.250.7:80
201.150.2.170:8080
200.255.18.61:3128
113.230.76.234:443
114.33.112.160:8181
41.189.36.26:3128
113.108.219.46:8080
202.51.116.177:8888
57.90.36.24:80
24.100.137.18:8080
186.195.172.68:80
61.157.217.31:80
157.100.157.154:3128
212.118.23.98:8080


The code is a bit ugly but it works.
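
Since urllib2 only exists in Python 2, here is a minimal sketch of the same proxy check for Python 3, using urllib.request. It is untested and simply assumes the same watismijnip.nl title check as above; the proxy address in the example is just one entry from the list and may no longer work.

import urllib.request

def fetch_via_proxy(url, proxy):
    # Fetch url through the given 'host:port' HTTP proxy; return the body, or None on failure.
    handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(handler)
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        return opener.open(url, timeout=10).read().decode('utf-8', errors='replace')
    except Exception:
        return None

def proxy_works(proxy):
    # A proxy counts as working when it serves the watismijnip.nl page with the expected title.
    body = fetch_via_proxy("http://www.watismijnip.nl", proxy)
    return body is not None and "<title>www.WatIsMijnIP.nl ->" in body

# Example (proxy address taken from the list above):
if proxy_works("84.22.3.32:8080"):
    print("proxy works")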