Commit 767321a7 by yexing

u

parent 58d033ad
import asyncio
import json
import random
import re
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
from loguru import logger
# from utils.admin_api import callback_cookie
async def get_headers():
    """Build the default request headers, with a freshly randomized User-Agent."""
    return {
        'authority': 'www.amazon.com',
        'accept': 'text/html,*/*',
        'referer': 'https://www.amazon.com/',
        'user-agent': await get_rand_ua(),
    }
async def get_rand_ua():
    """Return a random desktop-Chrome-on-Windows User-Agent string."""
    # Real Chrome release versions (115 down to 90) to blend in with live traffic.
    chrome_versions = [
        "115.0.5790.171", "115.0.5790.110", "115.0.5790.102", "115.0.5790.99", "114.0.5735.199",
        "114.0.5735.134", "114.0.5735.110", "114.0.5735.91", "113.0.5672.127", "113.0.5672.93",
        "113.0.5672.64", "112.0.5615.138", "112.0.5615.121", "112.0.5615.87", "112.0.5615.50",
        "111.0.5563.147", "111.0.5563.111", "111.0.5563.65", "110.0.5481.178", "110.0.5481.104",
        "110.0.5481.100", "110.0.5481.97", "110.0.5481.78", "109.0.5414.120", "109.0.5414.75",
        "108.0.5359.99", "108.0.5359.95", "108.0.5359.72", "107.0.5304.107", "107.0.5304.88",
        "107.0.5304.63", "106.0.5249.119", "106.0.5249.103", "106.0.5249.91", "106.0.5249.62",
        "105.0.5195.127", "105.0.5195.102", "104.0.5112.102", "104.0.5112.81", "103.0.5060.134",
        "103.0.5060.114", "103.0.5060.66", "103.0.5060.53", "102.0.5005.115", "102.0.5005.63",
        "101.0.4951.67", "101.0.4951.64", "101.0.4951.54", "101.0.4951.41", "100.0.4896.127",
        "100.0.4896.88", "100.0.4896.75", "100.0.4896.60", "99.0.4844.84", "99.0.4844.82", "99.0.4844.74",
        "99.0.4844.51", "98.0.4758.102", "98.0.4758.82", "98.0.4758.80", "97.0.4692.99", "97.0.4692.71",
        "96.0.4664.110", "96.0.4664.93", "96.0.4664.45", "95.0.4638.69", "95.0.4638.54", "94.0.4606.81",
        "94.0.4606.71", "94.0.4606.61", "94.0.4606.54", "93.0.4577.82", "93.0.4577.63", "92.0.4515.159",
        "92.0.4515.131", "92.0.4515.107", "91.0.4472.164", "91.0.4472.124", "91.0.4472.114",
        "91.0.4472.106", "91.0.4472.101", "91.0.4472.77", "90.0.4430.212", "90.0.4430.93", "90.0.4430.85",
        "90.0.4430.72"]
    # Common Windows platform tokens (Win10 / 8.1 / 8, 64-bit and WOW64).
    platform_tokens = [
        "Windows NT 10.0; Win64; x64",
        "Windows NT 10.0; WOW64",
        "Windows NT 6.3; Win64; x64",
        "Windows NT 6.3; WOW64",
        "Windows NT 6.2; Win64; x64",
        "Windows NT 6.2; WOW64",
    ]
    chosen_platform = random.choice(platform_tokens)
    chosen_version = random.choice(chrome_versions)
    return f"Mozilla/5.0 ({chosen_platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{chosen_version} Safari/537.36"
async def get_proxies():
    """Return the proxy mapping for outbound requests; None means connect directly."""
    # Local-debugging override (uncomment to route through a local proxy):
    # proxy = "http://127.0.0.1:7890"
    # return {"https": proxy, "http": proxy}
    return None
async def main(zip_code):
    """Set the Amazon delivery address to *zip_code* and collect the session cookie.

    Flow: load the home page (detecting captcha), extract the anti-CSRF token
    from the location modal, POST the zip code to the glow endpoint, verify it
    took effect, then join the session cookies into a single header string.

    :param zip_code: US zip code to set, e.g. "20001"
    :raises Exception: on captcha, missing CSRF token, or failed zip verification
    """
    headers = await get_headers()
    proxies = await get_proxies()
    async with AsyncSession(max_clients=1) as s:
        url = "https://www.amazon.com"
        response = await s.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
        captcha = html.find('input', id="captchacharacters")
        if captcha:
            raise Exception("出现验证码了")
        data_modal_action = html.find('span', id='nav-global-location-data-modal-action')
        if not data_modal_action:
            raise Exception("获取data_modal_action失败")
        data_modal = data_modal_action.get('data-a-modal')
        if data_modal:
            data_modal = json.loads(data_modal)
            csrf_token = data_modal.get('ajaxHeaders', {}).get('anti-csrftoken-a2z')
            logger.info(f"获取csrf_token成功: {csrf_token}")
            headers['anti-csrftoken-a2z'] = csrf_token
        url = "https://www.amazon.com/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
        response = await s.request("GET", url, headers=headers, proxies=proxies)
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern Python); the regex itself is unchanged.
        csrf_token = re.findall(r'CSRF_TOKEN : \"([\s\S]*?)\"', response.text)
        if len(csrf_token) == 0:
            raise Exception("获取csrf_token失败")
        headers['anti-csrftoken-a2z'] = csrf_token[0]
        logger.info(f"获取csrf_token成功: {csrf_token[0]}")
        url = "https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow"
        payload = json.dumps({
            "locationType": "LOCATION_INPUT",
            # BUG FIX: the zip code was hard-coded to "20001", so calling
            # main() with any other zip_code always failed verification below.
            "zipCode": zip_code,
            "deviceType": "web",
            "storeContext": "generic",
            "pageType": "Gateway",
            "actionSource": "glow"
        })
        new_headers = {
            'authority': 'www.amazon.com',
            'accept': 'text/html,*/*',
            'origin': 'https://www.amazon.com',
            'referer': 'https://www.amazon.com/',
            'x-requested-with': 'XMLHttpRequest',
            'content-type': 'application/json'
        }
        headers.update(new_headers)
        response = await s.request("POST", url, headers=headers, data=payload, proxies=proxies)
        address_text = response.text
        logger.info(f"设置邮编返回值: {address_text}")
        if address_text:
            address_data = json.loads(address_text)
            if address_data.get('address', {}).get('zipCode', '') != zip_code:
                raise Exception("邮编验证失败")
        url = "https://www.amazon.com"
        response = await s.request("GET", url, headers=headers, proxies=proxies)
        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
        # Robustness: find() returns None when the element is missing; the
        # original crashed with AttributeError on `.text` instead of failing
        # with a clear verification error.
        ingress_line = html.find('span', id="glow-ingress-line2")
        if ingress_line is None or zip_code not in ingress_line.text:
            raise Exception("邮编验证失败")
        # Join the session cookies as "name=value;name=value" (no trailing ";").
        cookie = ';'.join('{0}={1}'.format(name, value) for name, value in s.cookies.items())
        result = {
            "data": {
                "cookie": cookie,
                "user_agent": headers['user-agent'],
            },
            "type": 1,
        }
        logger.success(f"设置邮编成功, {json.dumps(result)}")
        # callback_response = await callback_cookie(result)
        # logger.success(f"回调cookie: {json.dumps(callback_response)}")
        await asyncio.sleep(3)
if __name__ == '__main__':
    import sys

    # BUG FIX: asyncio.WindowsSelectorEventLoopPolicy exists only on Windows;
    # setting it unconditionally raises AttributeError on Linux/macOS.
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    zip_code = "20001"
    # Retry forever: each iteration attempts a full cookie-harvest cycle and
    # logs (rather than propagates) any failure.
    while True:
        try:
            asyncio.run(main(zip_code))
        except Exception as e:
            logger.error(e)
......@@ -19,7 +19,6 @@ from db import RedisSingleton
from proxy import ProxyManager
from tool import Fmt, Task, Request
redis_config = config["redis"]
task_monitoring_config = config["task-product-detail"]
redis_singleton = RedisSingleton(redis_url=config["redis"]["url"])
......
import json
import re
from const import Postcode, Site
from db import RedisSingleton
from proxy import ProxyManager
from tool import Request, Task
from conf import config
REDIS = RedisSingleton(redis_url=config["redis"]["url"])
task_monitoring_config = config["task-product-detail"]
class Tool:
    """Static helpers for parsing Amazon product pages and URLs."""

    @staticmethod
    def get_amazon_sku(text):
        """Extract the SKU dimension mapping embedded in the page's JavaScript.

        :param text: raw page HTML
        :return: dict of variant display data, or an empty list when not found
        """
        amazon_skus = re.findall(r"dimensionValuesDisplayData\" : ({[\s\S]*?}),", text)
        if len(amazon_skus) == 0:
            # Fallback: some page layouts use a slightly different JS key.
            amazon_skus = re.findall(r"dimensionValuesData\": ({[\s\S]*?}),", text)
        if len(amazon_skus):
            amazon_skus = json.loads(amazon_skus[0])
        return amazon_skus

    @staticmethod
    def get_url_asin(url: str) -> str:
        """Extract the ASIN from a product URL containing ``dp/<asin>``.

        :param url: Amazon product URL
        :return: the ASIN string, or "" when no pattern matches
        """
        # Raw strings: "\?" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern Python). Patterns are tried most-specific first.
        patterns = [r"dp/(.+?)\?", r"dp/(.+?)/", r"dp/(.+)"]
        for pattern in patterns:
            matches = re.findall(pattern, url)
            if matches:
                asin = matches[0]
                # A capture longer than a real ASIN (10 chars) still carries
                # trailing path segments; keep only the first segment.
                if len(asin) > 10:
                    asin = asin.split("/")[0]
                return asin
        # BUG FIX: the original fell through and returned the last (empty)
        # findall list instead of the documented empty string.
        return ""

    @staticmethod
    def get_book_asin(url: str, asin: str):
        """Resolve the ASIN for a book variant link.

        :param url: variant href; "javascript:void(0)" marks the current variant
        :param asin: ASIN of the page being parsed (used for the current variant)
        :return: the resolved ASIN, or "" when the href has no /dp/ segment
        """
        if url == "javascript:void(0)":
            return asin
        found = re.findall(r"/dp/(\w+)", url)
        if not found:
            return ""
        return found[0]

    @staticmethod
    def get_title(html):
        """Extract the product title text nodes from a parsed page.

        :param html: lxml-style element supporting ``xpath`` — TODO confirm type
        :return: list of title text nodes (may be empty)
        """
        title = html.xpath('//span[@id="productTitle"]/text()')
        if len(title) == 0:
            # Fallback selector used on some page layouts.
            title = html.xpath('//span[@id="bond-title-desktop"]/text()')
        return title

    @staticmethod
    def get_data_json(text):
        """Parse the JSON object passed to jQuery.parseJSON in the page's JS.

        :param text: raw page HTML
        :return: decoded JSON object
        :raises IndexError: when the page contains no jQuery.parseJSON call
        """
        data_json = re.findall(r"jQuery.parseJSON\('(.*)'\)", text)
        # The embedded JSON escapes single quotes for the JS string literal.
        data_json = data_json[0].replace("\\'", "'")
        return json.loads(data_json)
class ProxyMixin:
    """Shared proxy-pool helpers backed by the Redis-based ProxyManager."""

    proxy_manager = ProxyManager(REDIS)

    def get_proxy(self):
        """Fetch one proxy from the pool.

        :return: dict with ``proxy`` (credentialed host:port) and ``temp_proxy``
                 (the raw "#"-delimited pool entry), or None when the pool is empty
        """
        # if self.is_debug:
        # test_proxy = "127.0.0.1:7890"
        # proxy = "#1#2#127.0.0.1:7890"
        # else:
        raw_entry = self.proxy_manager.get_proxy()
        if raw_entry is None:
            return None
        # Pool entries are "#"-delimited; the first field is host:port.
        host_port = raw_entry.split("#")[0]
        return {
            "proxy": f"chensav:chensav@{host_port}",
            "temp_proxy": raw_entry,
        }

    def join_proxy(self, proxy):
        """Return a proxy entry to the pool.

        :param proxy: raw pool entry previously obtained from get_proxy()
        :return: result of ProxyManager.join_proxy
        """
        return self.proxy_manager.join_proxy(proxy)

    def delete_proxy(self, proxy):
        """Remove a proxy from the pool by its pool name.

        :param proxy: raw "#"-delimited pool entry; field 1 is the pool name
        :return: result of ProxyManager.delete_proxy
        """
        pool_name = proxy.split("#")[1]
        return self.proxy_manager.delete_proxy(pool_name)

    def cookie_error(self):
        """Increment the shared cookie-error counter in Redis."""
        redis_conn = REDIS.get_connection()
        redis_conn.incr("amazon:cookie-error")
class Info(ProxyMixin):
    """Fetches Amazon (.com) product-detail pages for monitoring tasks."""

    site = Site.com
    postcode = Postcode.com
    task_manager = Task(REDIS)

    def __init__(self):
        # All runtime knobs come from the task-product-detail config section;
        # flags are stored as the strings "True"/"False".
        self.task_key = task_monitoring_config["task_key"]
        self.item_key = task_monitoring_config["item_key"]
        self.task_number = int(task_monitoring_config["task_number"])
        self.enabled = task_monitoring_config["enabled"] == "True"
        self.request_timeout = int(task_monitoring_config["request_timeout"])
        self.is_debug = task_monitoring_config["is_debug"] == "True"

    def run(self, task):
        """Request the product-detail page for one monitoring task.

        :param task: dict with at least a ``url`` key
        :return: result of Request.request_html
        :raises Exception: when no proxy is available from the pool
        """
        url = task.get("url", "")
        asin = Tool.get_url_asin(url)
        url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
        _proxy = self.get_proxy()
        # BUG FIX: get_proxy() returns None when the pool is empty; the
        # original then crashed with TypeError on _proxy["proxy"].
        if _proxy is None:
            raise Exception("no proxy available")
        headers = self.task_manager.get_loca_cookie(site=self.site)
        return Request.request_html(
            url,
            _proxy["proxy"],
            **{
                "headers": headers,
                "timeout": self.request_timeout,
                "postcode": self.postcode,
            },
        )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment