你知唔知我系靓仔
验证代理可用性
因为爬取到的代理可用性较低,
有效期大概只有几分钟,所以需要在使用前进行验证
输出格式为
socks4://1.1.1.1:1080|VN|1.1.1.1
socks5://1.1.1.1:7891|CN|1.1.1.1
http://1.1.1.1:4003|VN|1.1.1.1
支持http/https/socks4/socks5
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# Timeout (seconds, per the requests `timeout` argument) passed to every
# validation request; raised so slow proxies have time to connect and respond.
TIMEOUT = 15
def validate_proxy(proxy):
    """Check whether a proxy can reach ipinfo.io and describe its exit node.

    Args:
        proxy: Proxy URL such as ``socks5://1.1.1.1:7891`` or
            ``http://1.1.1.1:4003``; it is used for both HTTP and HTTPS
            traffic.

    Returns:
        ``"<proxy>|<country>|<ip>"`` built from the JSON fetched through the
        proxy (``"N/A"`` replaces a missing field), or ``None`` when the
        proxy is blank, the request fails, or the response is not a JSON
        object.
    """
    # A blank proxy makes requests fall back to a direct connection, which
    # would wrongly report the local machine as a working proxy.
    if not proxy or not proxy.strip():
        return None

    proxies = {"http": proxy, "https": proxy}
    try:
        response = requests.get("https://ipinfo.io/", proxies=proxies, timeout=TIMEOUT)
        response.raise_for_status()
        ip_info = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return None

    # Anything other than a JSON object cannot carry the fields read below.
    if not isinstance(ip_info, dict):
        return None

    country_code = ip_info.get("country", "N/A")
    ip_address = ip_info.get("ip", "N/A")
    return f"{proxy}|{country_code}|{ip_address}"
def validate_proxies(proxies, max_workers=50, output_path="ok.txt"):
    """Validate proxies concurrently and append the working ones to a file.

    Args:
        proxies: Iterable of proxy URL strings. Surrounding whitespace is
            stripped and blank entries are skipped.
        max_workers: Number of worker threads used for validation.
        output_path: File to which each successful result line is appended
            as soon as that result is collected.

    Returns:
        List of the non-empty ``validate_proxy`` results, in submission
        order.
    """
    candidates = []
    for proxy in proxies:
        candidate = proxy.strip()
        if candidate:
            candidates.append(candidate)

    validated_proxies = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(validate_proxy, candidate) for candidate in candidates]
        for future in tqdm(futures, total=len(futures), desc="Validating Proxies", unit="proxy"):
            try:
                result = future.result()
            except Exception as exc:
                # One misbehaving proxy must not abort the whole batch;
                # report it without breaking the progress bar and move on.
                tqdm.write(f"Skipping proxy after unexpected error: {exc!r}")
                continue
            if not result:
                continue
            validated_proxies.append(result)
            # Append immediately so collected results survive an
            # interrupted run.
            with open(output_path, "a", encoding="utf-8") as file:
                file.write(result + "\n")
    return validated_proxies
def main():
    """Read proxy.txt, de-duplicate its entries and validate them."""
    with open("proxy.txt", "r") as file:
        # Strip before de-duplicating: otherwise "x\n" and a final "x"
        # without a newline would count as two different proxies.
        stripped_lines = (line.strip() for line in file)
        # dict.fromkeys removes duplicates while keeping file order;
        # blank lines are dropped.
        proxies = list(dict.fromkeys(line for line in stripped_lines if line))
    validate_proxies(proxies)
# Run the validation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()