Commit 11e330a5 by yexing

u

parent 9fcc7b4f
[.gitignore]
 __pycache__
-.vscode
+.*
 celerybeat-*
-.pytest_cache
 log
 pid
 image
......
[const.py]
@@ -22,6 +22,18 @@ class Data:
     def inverse_dict(cls):
         return {v: k for k, v in cls.items()}

+class PropProxy:
+    def __init__(self, ref: type, prop: str):
+        self._ref = ref
+        self._prop = prop
+
+    @property
+    def value(self):
+        return getattr(self._ref, self._prop)
+
+    def __str__(self):
+        return str(self.value)
+
 class Spider(Data):
     detail = "task-product-detail"
@@ -67,3 +79,7 @@ class StockStatus(Data):
 class SiteType(Data):
     com = 1
     de = 2
+    it = 3
+    fr = 4
+    es = 5
+    jp = 6
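The new PropProxy gives a class-level attribute late binding: instead of copying a value at import time, it re-reads ref.prop on every access. InfoSpider.postcode (further down in this commit) uses it so that query_info()'s setattr(Postcode, "com", zip_code) is seen by an already-defined class attribute. A minimal standalone sketch of the pattern, given the PropProxy class above; the Postcode stub and the "10001"/"20001" values are illustrative only:

class Postcode:
    com = "10001"

proxy = PropProxy(Postcode, "com")
print(proxy.value)         # 10001 — the value at this moment
Postcode.com = "20001"     # what query_info() does via setattr
print(proxy.value, proxy)  # 20001 20001 — each access re-reads the attribute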
......
[service entry point (filename not captured in this view)]
-import asyncio
 import json
+import os
 import random
 import re

+from tenacity import retry, stop_after_attempt, wait_random
+import uvicorn
 from bs4 import BeautifulSoup
-from curl_cffi.requests import AsyncSession
+from curl_cffi.requests import Session
+from fastapi import FastAPI, HTTPException, Query
 from loguru import logger

+from const import Postcode, Site
+from db import RedisSingleton
+from spider.base_info import InfoSpider
+from tool import Task
+from conf import config

 # from utils.admin_api import callback_cookie

+IS_DEBUG = os.environ.get("IS_DEBUG", False)
+REDIS = RedisSingleton(redis_url=config["redis"]["url"])
+app = FastAPI()

-async def get_headers():
-    user_agent = await get_rand_ua()
+def get_headers():
+    user_agent = get_rand_ua()
     return {
-        'authority': 'www.amazon.com',
-        'accept': 'text/html,*/*',
-        'referer': 'https://www.amazon.com/',
-        'user-agent': user_agent,
+        "authority": "www.amazon.com",
+        "accept": "text/html,*/*",
+        "referer": "https://www.amazon.com/",
+        "user-agent": user_agent,
     }
-async def get_rand_ua():
+def get_rand_ua():
     """
     Get a random UA.
     :return:
     """
     version_list = [
         # (this commit only reflows the list to one entry per line; the values are unchanged)
         "115.0.5790.171", "115.0.5790.110", "115.0.5790.102", "115.0.5790.99", "114.0.5735.199",
         "114.0.5735.134", "114.0.5735.110", "114.0.5735.91", "113.0.5672.127", "113.0.5672.93",
         "113.0.5672.64", "112.0.5615.138", "112.0.5615.121", "112.0.5615.87", "112.0.5615.50",
         "111.0.5563.147", "111.0.5563.111", "111.0.5563.65", "110.0.5481.178", "110.0.5481.104",
         "110.0.5481.100", "110.0.5481.97", "110.0.5481.78", "109.0.5414.120", "109.0.5414.75",
         "108.0.5359.99", "108.0.5359.95", "108.0.5359.72", "107.0.5304.107", "107.0.5304.88",
         "107.0.5304.63", "106.0.5249.119", "106.0.5249.103", "106.0.5249.91", "106.0.5249.62",
         "105.0.5195.127", "105.0.5195.102", "104.0.5112.102", "104.0.5112.81", "103.0.5060.134",
         "103.0.5060.114", "103.0.5060.66", "103.0.5060.53", "102.0.5005.115", "102.0.5005.63",
         "101.0.4951.67", "101.0.4951.64", "101.0.4951.54", "101.0.4951.41", "100.0.4896.127",
         "100.0.4896.88", "100.0.4896.75", "100.0.4896.60", "99.0.4844.84", "99.0.4844.82", "99.0.4844.74",
         "99.0.4844.51", "98.0.4758.102", "98.0.4758.82", "98.0.4758.80", "97.0.4692.99", "97.0.4692.71",
         "96.0.4664.110", "96.0.4664.93", "96.0.4664.45", "95.0.4638.69", "95.0.4638.54", "94.0.4606.81",
         "94.0.4606.71", "94.0.4606.61", "94.0.4606.54", "93.0.4577.82", "93.0.4577.63", "92.0.4515.159",
         "92.0.4515.131", "92.0.4515.107", "91.0.4472.164", "91.0.4472.124", "91.0.4472.114",
         "91.0.4472.106", "91.0.4472.101", "91.0.4472.77", "90.0.4430.212", "90.0.4430.93", "90.0.4430.85",
         "90.0.4430.72"]
     windows_list = [
         "Windows NT 10.0; Win64; x64",
         "Windows NT 10.0; WOW64",
@@ -57,102 +139,125 @@ async def get_rand_ua():
     return f"Mozilla/5.0 ({window}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36"
-async def get_proxies():
-    # proxy = "http://127.0.0.1:7890"
-    # return {
-    #     "https": proxy,
-    #     "http": proxy,
-    # }
-    return None
+def get_proxies():
+    proxy = "http://127.0.0.1:7890"
+    return {
+        "https": proxy,
+        "http": proxy,
+    }

-async def main(zip_code):
-    headers = await get_headers()
-    proxies = await get_proxies()
-    async with AsyncSession(max_clients=1) as s:
+@retry(
+    stop=stop_after_attempt(20),
+    wait=wait_random(3, 6),
+    retry_error_callback=lambda *_: ...,
+)
+def run(zip_code):
+    inst = Task(REDIS)
+    if inst.get_loca_cookie(site=Site.com, postcode=zip_code, only_local=True):
+        return
+    headers = get_headers()
+    proxies = get_proxies()
+    with Session() as s:
         url = "https://www.amazon.com"
-        response = await s.get(url, headers=headers, proxies=proxies)
-        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
-        captcha = html.find('input', id="captchacharacters")
+        response = s.get(url, headers=headers, proxies=proxies)
+        html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
+        captcha = html.find("input", id="captchacharacters")
         if captcha:
             raise Exception("出现验证码了")
-        data_modal_action = html.find('span', id='nav-global-location-data-modal-action')
+        data_modal_action = html.find(
+            "span", id="nav-global-location-data-modal-action"
+        )
         if not data_modal_action:
             raise Exception("获取data_modal_action失败")
-        data_modal = data_modal_action.get('data-a-modal')
+        data_modal = data_modal_action.get("data-a-modal")
         if data_modal:
             data_modal = json.loads(data_modal)
-            csrf_token = data_modal.get('ajaxHeaders', {}).get('anti-csrftoken-a2z')
+            csrf_token = data_modal.get("ajaxHeaders", {}).get("anti-csrftoken-a2z")
             logger.info(f"获取csrf_token成功: {csrf_token}")
-            headers['anti-csrftoken-a2z'] = csrf_token
+            headers["anti-csrftoken-a2z"] = csrf_token
             url = "https://www.amazon.com/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
-            response = await s.request("GET", url, headers=headers, proxies=proxies)
-            csrf_token = re.findall('CSRF_TOKEN : \"([\s\S]*?)\"', response.text)
+            response = s.request("GET", url, headers=headers, proxies=proxies)
+            csrf_token = re.findall('CSRF_TOKEN : "([\s\S]*?)"', response.text)
            if len(csrf_token) == 0:
                 raise Exception("获取csrf_token失败")
-            headers['anti-csrftoken-a2z'] = csrf_token[0]
+            headers["anti-csrftoken-a2z"] = csrf_token[0]
             logger.info(f"获取csrf_token成功: {csrf_token[0]}")
             url = "https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow"
-            payload = json.dumps({
-                "locationType": "LOCATION_INPUT",
-                "zipCode": "20001",
-                "deviceType": "web",
-                "storeContext": "generic",
-                "pageType": "Gateway",
-                "actionSource": "glow"
-            })
+            payload = json.dumps(
+                {
+                    "locationType": "LOCATION_INPUT",
+                    "zipCode": zip_code,
+                    "deviceType": "web",
+                    "storeContext": "generic",
+                    "pageType": "Gateway",
+                    "actionSource": "glow",
+                }
+            )
             new_headers = {
-                'authority': 'www.amazon.com',
-                'accept': 'text/html,*/*',
-                'origin': 'https://www.amazon.com',
-                'referer': 'https://www.amazon.com/',
-                'x-requested-with': 'XMLHttpRequest',
-                'content-type': 'application/json'
+                "authority": "www.amazon.com",
+                "accept": "text/html,*/*",
+                "origin": "https://www.amazon.com",
+                "referer": "https://www.amazon.com/",
+                "x-requested-with": "XMLHttpRequest",
+                "content-type": "application/json",
             }
             headers.update(new_headers)
-            response = await s.request("POST", url, headers=headers, data=payload, proxies=proxies)
+            response = s.request(
+                "POST", url, headers=headers, data=payload, proxies=proxies
+            )
             address_text = response.text
             logger.info(f"设置邮编返回值: {address_text}")
             if address_text:
                 address_data = json.loads(address_text)
-                if address_data.get('address', {}).get('zipCode', '') != zip_code:
+                if address_data.get("address", {}).get("zipCode", "") != zip_code:
                     raise Exception("邮编验证失败")
             url = "https://www.amazon.com"
-            response = await s.request("GET", url, headers=headers, proxies=proxies)
-            html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
-            data = html.find('span', id="glow-ingress-line2").text
+            response = s.request("GET", url, headers=headers, proxies=proxies)
+            html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
+            data = html.find("span", id="glow-ingress-line2").text
             if zip_code not in data:
                 raise Exception("邮编验证失败")
             cookies = s.cookies.items()
             # join the cookies into a single string
-            cookie = ''
+            cookie = ""
             for name, value in cookies:
-                cookie += '{0}={1};'.format(name, value)
+                cookie += "{0}={1};".format(name, value)
             cookie = cookie[:-1]
-            result = {
-                "data": {
-                    "cookie": cookie,
-                    "user_agent": headers['user-agent'],
-                },
+            data = {
+                "cookie": cookie,
+                "user_agent": headers["user-agent"],
+            }
+            inst.set_loca_cookie(data, site=Site.com, postcode=zip_code)
+            result = {
+                "data": data,
                 "type": 1,
             }
             logger.success(f"设置邮编成功, {json.dumps(result)}")
-            # callback_response = await callback_cookie(result)
+            # callback_response = callback_cookie(result)
             # logger.success(f"回调cookie: {json.dumps(callback_response)}")
-    await asyncio.sleep(3)
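Note the retry policy on run(): with tenacity, retry_error_callback is invoked once all attempts are exhausted and its return value is returned instead of re-raising, so the `lambda *_: ...` above makes run() fail silently (returning Ellipsis) after 20 tries. A self-contained illustration of that behavior with smaller numbers; flaky() is a hypothetical stand-in:

from tenacity import retry, stop_after_attempt, wait_random

@retry(
    stop=stop_after_attempt(3),                     # give up after 3 tries
    wait=wait_random(0, 1),                         # sleep 0-1s between tries
    retry_error_callback=lambda retry_state: None,  # swallow the final error
)
def flaky():
    raise RuntimeError("boom")

print(flaky())  # prints None after 3 failed attempts; no exception escapes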
-if __name__ == '__main__':
-    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    zip_code = "20001"
-    while True:
-        try:
-            asyncio.run(main(zip_code))
-        except Exception as e:
-            logger.error(e)
+@app.get("/query/info")
+def query_info(
+    zip_code: str = Query(..., description="邮编"),
+    url: str = Query(..., description="URL地址"),
+):
+    try:
+        setattr(Postcode, "com", zip_code)
+        run(zip_code)
+        return InfoSpider().run({"url": url})
+    except Exception as e:
+        logger.error(e)
+        raise HTTPException(status_code=500, detail="服务出错了")

+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=9012)
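A hypothetical client call against the new endpoint, assuming the service is running locally on the port above; the zip code and ASIN URL are placeholders:

import requests

resp = requests.get(
    "http://127.0.0.1:9012/query/info",
    params={"zip_code": "20001", "url": "https://www.amazon.com/dp/B000000000"},
)
print(resp.status_code, resp.json())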
......
[spider/base_info.py]
+from __future__ import annotations
+
 import json
 import re
+import os

 import curl_cffi
 from loguru import logger
 from tenacity import retry, stop_after_attempt, wait_random
 from lxml import etree

-from const import Postcode, Site
+from const import Postcode, PropProxy, Site
 from db import RedisSingleton
 from proxy import ProxyManager
 from tool import Fmt, Request, Task
 from conf import config

+IS_DEBUG = os.environ.get("IS_DEBUG", False)
 REDIS = RedisSingleton(redis_url=config["redis"]["url"])
 task_info_config = config["task-info-detail"]

 class Tool:
     @staticmethod
     def get_amazon_sku(text):
@@ -95,10 +98,10 @@ class ProxyMixin:
         :return:
         """
-        # if self.is_debug:
-        #     test_proxy = "127.0.0.1:7890"
-        #     proxy = "#1#2#127.0.0.1:7890"
-        # else:
-        proxy = self.proxy_manager.get_proxy()
+        if self.is_debug:
+            test_proxy = "127.0.0.1:7890"
+            proxy = "#1#2#127.0.0.1:7890"
+        else:
+            proxy = self.proxy_manager.get_proxy()
         if proxy is None:
             return None
@@ -134,9 +137,9 @@ class ProxyMixin:
         redis.incr("amazon:cookie-error")

-class Info(ProxyMixin):
+class InfoSpider(ProxyMixin):
     site = Site.com
-    postcode = Postcode.com
+    postcode = PropProxy(Postcode, site)
     task_manager = Task(REDIS)

     def __init__(self):
@@ -152,33 +155,52 @@ class Info(ProxyMixin):
         free_delivery = html.xpath(
             '//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
         )
-        detail_bullets = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr')
+        ths: list[etree._Element] = html.xpath(
+            '//*[@id="productDetails_detailBullets_sections1"]/tr/th'
+        )
+        tds: list[etree._Element] = html.xpath(
+            '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
+        )
+        detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
         free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
         return {
             "free_delivery": free_delivery,
+            "product_dimensions": detail.get("Product Dimensions", ""),
+            "item_weight": detail.get("Item Weight", ""),
         }
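Dropping /tbody from the XPath matters here: lxml parses the raw HTML, where the detail table carries no <tbody> element (browsers insert one themselves, which is what DevTools shows). A self-contained sketch of the new th/td pairing, on made-up rows:

from lxml import etree

html = etree.HTML("""
<table id="productDetails_detailBullets_sections1">
  <tr><th> Product Dimensions </th><td> 10 x 5 x 2 inches </td></tr>
  <tr><th> Item Weight </th><td> 1.2 pounds </td></tr>
</table>""")
ths = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/th')
tds = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/td')
print({th.text.strip(): td.text.strip() for th, td in zip(ths, tds)})
# {'Product Dimensions': '10 x 5 x 2 inches', 'Item Weight': '1.2 pounds'}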
-    @retry(stop=stop_after_attempt(20), wait=wait_random(3, 6), retry_error_cls=lambda *_:...)
-    def run(self, task):
+    @retry(
+        stop=stop_after_attempt(20),
+        wait=wait_random(3, 6),
+        retry_error_callback=lambda *_: ...,
+    )
+    def run(self, task: dict):
         url = task.get("url", "")
         asin = Tool.get_url_asin(url)
         url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
         _proxy = self.get_proxy()
+        # if IS_DEBUG:
+        #     _proxy = {"proxy": None}
         if _proxy is None:
-            raise Exception("没有代理")
+            raise ValueError("没有代理")
         try:
-            headers = self.task_manager.get_loca_cookie(site=self.site)
+            headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
             text = Request.request_html(
                 url,
                 _proxy["proxy"],
-                **{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode},
+                **{
+                    "headers": headers,
+                    "timeout": self.request_timeout,
+                    "postcode": self.postcode.value,
+                },
             )
             response = self.format_content(text)
+            logger.debug(response)
             return response
         except curl_cffi.curl.CurlError:
             logger.error(f"请求超时: {url}")
+            raise
         except Exception as e:
             if str(e) == "出现验证码":
                 self.delete_proxy(_proxy["temp_proxy"])
@@ -186,6 +208,6 @@ class Info(ProxyMixin):
             if str(e) == "采集邮编错误":
                 self.cookie_error()
             logger.error(f"请求异常: {e} - {url}")
+            raise
         finally:
             self.join_proxy(_proxy["temp_proxy"])
......
[tool.py]
@@ -14,8 +14,9 @@ from loguru import logger
 from lxml import etree

 from conf import config
-from const import Lang, StockStatus
+from const import Lang, Site, StockStatus
 from const import SiteType
+from db import RedisSingleton

 DOMAIN = config["app"]["domain"]
 COOKIE = config["cookie"]
@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]

 class Task:
-    def __init__(self, redis_singleton):
+    def __init__(self, redis_singleton: RedisSingleton):
         self.redis_singleton = redis_singleton

     def get_task(self, task_key: str = "", batch_size: int = 10):
@@ -120,20 +121,33 @@ class Task:
             redis_client.delete(time_key)
         return cookie

-    def get_loca_cookie(self, site: str = "com"):
+    def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
         """
         Get the locally cached cookie.
         :return:
         """
         redis_client = self.redis_singleton.get_connection()
-        cookie = redis_client.get(f"cookie:{site}")
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        cookie = redis_client.get(key)
+        if only_local:
+            return cookie
         if not cookie:
             cookie = self.get_cookie(site)
         if isinstance(cookie, dict):
             return cookie
         return json.loads(cookie)

+    def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
+        redis_client = self.redis_singleton.get_connection()
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        redis_client.set(key, json.dumps(data))

 class Request:
     @staticmethod
@@ -189,7 +203,7 @@ class Request:
         is_link_error = html.xpath('//div[@id="g"]/a/@href')
         title = Tool.get_title(html)
         if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
-            raise Exception(f"采集内容有误")
+            raise Exception("采集内容有误")
         return text
......
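A hypothetical round-trip through the new per-postcode cookie cache, assuming a reachable Redis instance and the classes above (the Redis URL and cookie values are placeholders). With only_local=True the raw Redis value comes back without json.loads, so a cache miss is simply None:

task = Task(RedisSingleton(redis_url="redis://localhost:6379/0"))

# stores the dict as JSON under the key "cookie:com:20001"
task.set_loca_cookie(
    {"cookie": "session-id=123;", "user_agent": "Mozilla/5.0 placeholder"},
    site=Site.com,
    postcode="20001",
)

cached = task.get_loca_cookie(site=Site.com, postcode="20001", only_local=True)
print(cached)  # the stored JSON string, or None if nothing is cached yet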