Commit 11e330a5 by yexing

u

parent 9fcc7b4f
__pycache__
.vscode
.*
celerybeat-*
.pytest_cache
log
pid
image
@@ -22,6 +22,18 @@ class Data:
     def inverse_dict(cls):
         return {v: k for k, v in cls.items()}


+class PropProxy:
+    def __init__(self, ref: type, prop: str):
+        self._ref = ref
+        self._prop = prop
+
+    @property
+    def value(self):
+        return getattr(self._ref, self._prop)
+
+    def __str__(self):
+        return str(self.value)
+
 class Spider(Data):
     detail = "task-product-detail"
@@ -67,3 +79,7 @@ class StockStatus(Data):
 class SiteType(Data):
     com = 1
     de = 2
+    it = 3
+    fr = 4
+    es = 5
+    jp = 6
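`PropProxy` adds late binding to a class attribute: instead of copying the value of `Postcode.com` once at class-definition time, a spider can hold `PropProxy(Postcode, "com")` and have every `.value` access re-read the current `Postcode.com`. That is what lets the API handler later in this commit call `setattr(Postcode, "com", zip_code)` per request and have the already-defined spider class pick it up. A minimal sketch of the behavior (the `Postcode` stub below is illustrative, not the real `const.Postcode`):

```python
from const import PropProxy  # the class added above

class Postcode:       # illustrative stub standing in for const.Postcode
    com = "10001"

postcode = PropProxy(Postcode, "com")
print(postcode.value)              # -> "10001"

setattr(Postcode, "com", "20001")  # what the API handler does per request
print(postcode.value)              # -> "20001": re-resolved on every access
print(str(postcode))               # -> "20001": __str__ delegates to .value
```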
-import asyncio
 import json
 import os
 import random
 import re

+from tenacity import retry, stop_after_attempt, wait_random
+import uvicorn
 from bs4 import BeautifulSoup
-from curl_cffi.requests import AsyncSession
+from curl_cffi.requests import Session
+from fastapi import FastAPI, HTTPException, Query
 from loguru import logger

+from const import Postcode, Site
+from db import RedisSingleton
+from spider.base_info import InfoSpider
+from tool import Task
+from conf import config

 # from utils.admin_api import callback_cookie

 IS_DEBUG = os.environ.get("IS_DEBUG", False)
+REDIS = RedisSingleton(redis_url=config["redis"]["url"])
+
+app = FastAPI()

-async def get_headers():
-    user_agent = await get_rand_ua()
+def get_headers():
+    user_agent = get_rand_ua()
     return {
-        'authority': 'www.amazon.com',
-        'accept': 'text/html,*/*',
-        'referer': 'https://www.amazon.com/',
-        'user-agent': user_agent,
+        "authority": "www.amazon.com",
+        "accept": "text/html,*/*",
+        "referer": "https://www.amazon.com/",
+        "user-agent": user_agent,
     }

-async def get_rand_ua():
+def get_rand_ua():
     """
     Return a random Chrome user-agent string.
     :return:
     """
     version_list = [
-        "115.0.5790.171", "115.0.5790.110", "115.0.5790.102", "115.0.5790.99", "114.0.5735.199",
-        "114.0.5735.134", "114.0.5735.110", "114.0.5735.91", "113.0.5672.127", "113.0.5672.93",
-        "113.0.5672.64", "112.0.5615.138", "112.0.5615.121", "112.0.5615.87", "112.0.5615.50",
-        "111.0.5563.147", "111.0.5563.111", "111.0.5563.65", "110.0.5481.178", "110.0.5481.104",
-        "110.0.5481.100", "110.0.5481.97", "110.0.5481.78", "109.0.5414.120", "109.0.5414.75",
-        "108.0.5359.99", "108.0.5359.95", "108.0.5359.72", "107.0.5304.107", "107.0.5304.88",
-        "107.0.5304.63", "106.0.5249.119", "106.0.5249.103", "106.0.5249.91", "106.0.5249.62",
-        "105.0.5195.127", "105.0.5195.102", "104.0.5112.102", "104.0.5112.81", "103.0.5060.134",
-        "103.0.5060.114", "103.0.5060.66", "103.0.5060.53", "102.0.5005.115", "102.0.5005.63",
-        "101.0.4951.67", "101.0.4951.64", "101.0.4951.54", "101.0.4951.41", "100.0.4896.127",
-        "100.0.4896.88", "100.0.4896.75", "100.0.4896.60", "99.0.4844.84", "99.0.4844.82", "99.0.4844.74",
-        "99.0.4844.51", "98.0.4758.102", "98.0.4758.82", "98.0.4758.80", "97.0.4692.99", "97.0.4692.71",
-        "96.0.4664.110", "96.0.4664.93", "96.0.4664.45", "95.0.4638.69", "95.0.4638.54", "94.0.4606.81",
-        "94.0.4606.71", "94.0.4606.61", "94.0.4606.54", "93.0.4577.82", "93.0.4577.63", "92.0.4515.159",
-        "92.0.4515.131", "92.0.4515.107", "91.0.4472.164", "91.0.4472.124", "91.0.4472.114",
-        "91.0.4472.106", "91.0.4472.101", "91.0.4472.77", "90.0.4430.212", "90.0.4430.93", "90.0.4430.85",
-        "90.0.4430.72"]
+        "115.0.5790.171",
+        "115.0.5790.110",
+        "115.0.5790.102",
+        "115.0.5790.99",
+        "114.0.5735.199",
+        "114.0.5735.134",
+        "114.0.5735.110",
+        "114.0.5735.91",
+        "113.0.5672.127",
+        "113.0.5672.93",
+        "113.0.5672.64",
+        "112.0.5615.138",
+        "112.0.5615.121",
+        "112.0.5615.87",
+        "112.0.5615.50",
+        "111.0.5563.147",
+        "111.0.5563.111",
+        "111.0.5563.65",
+        "110.0.5481.178",
+        "110.0.5481.104",
+        "110.0.5481.100",
+        "110.0.5481.97",
+        "110.0.5481.78",
+        "109.0.5414.120",
+        "109.0.5414.75",
+        "108.0.5359.99",
+        "108.0.5359.95",
+        "108.0.5359.72",
+        "107.0.5304.107",
+        "107.0.5304.88",
+        "107.0.5304.63",
+        "106.0.5249.119",
+        "106.0.5249.103",
+        "106.0.5249.91",
+        "106.0.5249.62",
+        "105.0.5195.127",
+        "105.0.5195.102",
+        "104.0.5112.102",
+        "104.0.5112.81",
+        "103.0.5060.134",
+        "103.0.5060.114",
+        "103.0.5060.66",
+        "103.0.5060.53",
+        "102.0.5005.115",
+        "102.0.5005.63",
+        "101.0.4951.67",
+        "101.0.4951.64",
+        "101.0.4951.54",
+        "101.0.4951.41",
+        "100.0.4896.127",
+        "100.0.4896.88",
+        "100.0.4896.75",
+        "100.0.4896.60",
+        "99.0.4844.84",
+        "99.0.4844.82",
+        "99.0.4844.74",
+        "99.0.4844.51",
+        "98.0.4758.102",
+        "98.0.4758.82",
+        "98.0.4758.80",
+        "97.0.4692.99",
+        "97.0.4692.71",
+        "96.0.4664.110",
+        "96.0.4664.93",
+        "96.0.4664.45",
+        "95.0.4638.69",
+        "95.0.4638.54",
+        "94.0.4606.81",
+        "94.0.4606.71",
+        "94.0.4606.61",
+        "94.0.4606.54",
+        "93.0.4577.82",
+        "93.0.4577.63",
+        "92.0.4515.159",
+        "92.0.4515.131",
+        "92.0.4515.107",
+        "91.0.4472.164",
+        "91.0.4472.124",
+        "91.0.4472.114",
+        "91.0.4472.106",
+        "91.0.4472.101",
+        "91.0.4472.77",
+        "90.0.4430.212",
+        "90.0.4430.93",
+        "90.0.4430.85",
+        "90.0.4430.72",
+    ]
     windows_list = [
         "Windows NT 10.0; Win64; x64",
         "Windows NT 10.0; WOW64",
@@ -57,102 +139,125 @@ async def get_rand_ua():
     return f"Mozilla/5.0 ({window}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36"

-async def get_proxies():
-    # proxy = "http://127.0.0.1:7890"
-    # return {
-    #     "https": proxy,
-    #     "http": proxy,
-    # }
-    return None
+def get_proxies():
+    proxy = "http://127.0.0.1:7890"
+    return {
+        "https": proxy,
+        "http": proxy,
+    }

+@retry(
+    stop=stop_after_attempt(20),
+    wait=wait_random(3, 6),
+    retry_error_callback=lambda *_: ...,
+)
+def run(zip_code):
+    inst = Task(REDIS)
+    if inst.get_loca_cookie(site=Site.com, postcode=zip_code, only_local=True):
+        return
-async def main(zip_code):
-    headers = await get_headers()
-    proxies = await get_proxies()
-    async with AsyncSession(max_clients=1) as s:
+    headers = get_headers()
+    proxies = get_proxies()
+    with Session() as s:
url = "https://www.amazon.com"
response = await s.get(url, headers=headers, proxies=proxies)
html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
captcha = html.find('input', id="captchacharacters")
response = s.get(url, headers=headers, proxies=proxies)
html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
captcha = html.find("input", id="captchacharacters")
if captcha:
raise Exception("出现验证码了")
-        data_modal_action = html.find('span', id='nav-global-location-data-modal-action')
+        data_modal_action = html.find(
+            "span", id="nav-global-location-data-modal-action"
+        )
         if not data_modal_action:
             raise Exception("failed to locate data_modal_action")
-        data_modal = data_modal_action.get('data-a-modal')
+        data_modal = data_modal_action.get("data-a-modal")
         if data_modal:
             data_modal = json.loads(data_modal)
-            csrf_token = data_modal.get('ajaxHeaders', {}).get('anti-csrftoken-a2z')
+            csrf_token = data_modal.get("ajaxHeaders", {}).get("anti-csrftoken-a2z")
             logger.info(f"got csrf_token: {csrf_token}")
-            headers['anti-csrftoken-a2z'] = csrf_token
+            headers["anti-csrftoken-a2z"] = csrf_token
url = "https://www.amazon.com/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
response = await s.request("GET", url, headers=headers, proxies=proxies)
csrf_token = re.findall('CSRF_TOKEN : \"([\s\S]*?)\"', response.text)
response = s.request("GET", url, headers=headers, proxies=proxies)
csrf_token = re.findall('CSRF_TOKEN : "([\s\S]*?)"', response.text)
if len(csrf_token) == 0:
raise Exception("获取csrf_token失败")
headers['anti-csrftoken-a2z'] = csrf_token[0]
headers["anti-csrftoken-a2z"] = csrf_token[0]
logger.info(f"获取csrf_token成功: {csrf_token[0]}")
url = "https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow"
payload = json.dumps({
payload = json.dumps(
{
"locationType": "LOCATION_INPUT",
"zipCode": "20001",
"zipCode": zip_code,
"deviceType": "web",
"storeContext": "generic",
"pageType": "Gateway",
"actionSource": "glow"
})
"actionSource": "glow",
}
)
         new_headers = {
-            'authority': 'www.amazon.com',
-            'accept': 'text/html,*/*',
-            'origin': 'https://www.amazon.com',
-            'referer': 'https://www.amazon.com/',
-            'x-requested-with': 'XMLHttpRequest',
-            'content-type': 'application/json'
+            "authority": "www.amazon.com",
+            "accept": "text/html,*/*",
+            "origin": "https://www.amazon.com",
+            "referer": "https://www.amazon.com/",
+            "x-requested-with": "XMLHttpRequest",
+            "content-type": "application/json",
         }
         headers.update(new_headers)
-        response = await s.request("POST", url, headers=headers, data=payload, proxies=proxies)
+        response = s.request(
+            "POST", url, headers=headers, data=payload, proxies=proxies
+        )
         address_text = response.text
         logger.info(f"address-change response: {address_text}")
         if address_text:
             address_data = json.loads(address_text)
-            if address_data.get('address', {}).get('zipCode', '') != zip_code:
+            if address_data.get("address", {}).get("zipCode", "") != zip_code:
                 raise Exception("postcode verification failed")
url = "https://www.amazon.com"
response = await s.request("GET", url, headers=headers, proxies=proxies)
html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
data = html.find('span', id="glow-ingress-line2").text
response = s.request("GET", url, headers=headers, proxies=proxies)
html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
data = html.find("span", id="glow-ingress-line2").text
if zip_code not in data:
raise Exception("邮编验证失败")
cookies = s.cookies.items()
# 拼接为字符串cookie
cookie = ''
cookie = ""
for name, value in cookies:
cookie += '{0}={1};'.format(name, value)
cookie += "{0}={1};".format(name, value)
cookie = cookie[:-1]
-        result = {
-            "data": {
+        data = {
             "cookie": cookie,
-                "user_agent": headers['user-agent'],
-            },
+            "user_agent": headers["user-agent"],
+        }
+        inst.set_loca_cookie(data, site=Site.com, postcode=zip_code)
+        result = {
+            "data": data,
             "type": 1,
         }
         logger.success(f"postcode set successfully, {json.dumps(result)}")
-        # callback_response = await callback_cookie(result)
+        # callback_response = callback_cookie(result)
         # logger.success(f"cookie callback: {json.dumps(callback_response)}")
-        await asyncio.sleep(3)

-if __name__ == '__main__':
-    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    zip_code = "20001"
-    while True:
+@app.get("/query/info")
+def query_info(
+    zip_code: str = Query(..., description="postal code"),
+    url: str = Query(..., description="target product URL"),
+):
     try:
-        asyncio.run(main(zip_code))
+        setattr(Postcode, "com", zip_code)
+        run(zip_code)
+        return InfoSpider().run({"url": url})
     except Exception as e:
         logger.error(e)
+        raise HTTPException(status_code=500, detail="internal service error")
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=9012)
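Taken together, `query_info` pins `Postcode.com` to the requested postcode, lets `run()` ensure a postcode-scoped cookie exists in Redis (up to 20 attempts; with `retry_error_callback=lambda *_: ...`, exhausted retries return `Ellipsis` instead of raising), and then scrapes the product page through `InfoSpider`. A hedged usage sketch against the service as configured above (host, product URL, and timeout are illustrative):

```python
import requests  # any HTTP client works; requests is shown for brevity

resp = requests.get(
    "http://127.0.0.1:9012/query/info",  # port 9012 per uvicorn.run() above
    params={
        "zip_code": "20001",  # postcode the session cookie should emulate
        "url": "https://www.amazon.com/dp/B0EXAMPLE00",  # hypothetical product URL
    },
    timeout=300,  # first call may bootstrap a cookie: up to 20 tries, 3-6 s apart
)
resp.raise_for_status()
# Expected shape, per format_content() below:
# {"free_delivery": "...", "product_dimensions": "...", "item_weight": "..."}
print(resp.json())
```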
+from __future__ import annotations
+
 import json
 import re
 import os

+import curl_cffi
 from loguru import logger
 from tenacity import retry, stop_after_attempt, wait_random
 from lxml import etree

-from const import Postcode, Site
+from const import Postcode, PropProxy, Site
 from db import RedisSingleton
 from proxy import ProxyManager
 from tool import Fmt, Request, Task
 from conf import config

 IS_DEBUG = os.environ.get("IS_DEBUG", False)
 REDIS = RedisSingleton(redis_url=config["redis"]["url"])
 task_info_config = config["task-info-detail"]


 class Tool:
     @staticmethod
     def get_amazon_sku(text):
@@ -95,10 +98,10 @@ class ProxyMixin:
         :return:
         """
-        # if self.is_debug:
-        #     test_proxy = "127.0.0.1:7890"
-        #     proxy = "#1#2#127.0.0.1:7890"
-        # else:
+        if self.is_debug:
+            test_proxy = "127.0.0.1:7890"
+            proxy = "#1#2#127.0.0.1:7890"
+        else:
             proxy = self.proxy_manager.get_proxy()
         if proxy is None:
             return None
@@ -134,9 +137,9 @@ class ProxyMixin:
         redis.incr("amazon:cookie-error")


-class Info(ProxyMixin):
+class InfoSpider(ProxyMixin):
     site = Site.com
-    postcode = Postcode.com
+    postcode = PropProxy(Postcode, site)
     task_manager = Task(REDIS)

     def __init__(self):
@@ -152,33 +155,52 @@ class Info(ProxyMixin):
         free_delivery = html.xpath(
             '//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
         )
-        detail_bullets = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr')
+        ths: list[etree._Element] = html.xpath(
+            '//*[@id="productDetails_detailBullets_sections1"]/tr/th'
+        )
+        tds: list[etree._Element] = html.xpath(
+            '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
+        )
+        detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
         free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
         return {
             "free_delivery": free_delivery,
             "product_dimensions": detail.get("Product Dimensions", ""),
             "item_weight": detail.get("Item Weight", ""),
         }

-    @retry(stop=stop_after_attempt(20), wait=wait_random(3, 6), retry_error_cls=lambda *_:...)
-    def run(self, task):
+    @retry(
+        stop=stop_after_attempt(20),
+        wait=wait_random(3, 6),
+        retry_error_callback=lambda *_: ...,
+    )
+    def run(self, task: dict):
         url = task.get("url", "")
         asin = Tool.get_url_asin(url)
         url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
         _proxy = self.get_proxy()
+        # if IS_DEBUG:
+        #     _proxy = {"proxy": None}
         if _proxy is None:
-            raise Exception("no proxy available")
+            raise ValueError("no proxy available")
         try:
-            headers = self.task_manager.get_loca_cookie(site=self.site)
+            headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
             text = Request.request_html(
                 url,
                 _proxy["proxy"],
-                **{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode},
+                **{
+                    "headers": headers,
+                    "timeout": self.request_timeout,
+                    "postcode": self.postcode.value,
+                },
             )
             response = self.format_content(text)
             logger.debug(response)
             return response
+        except curl_cffi.curl.CurlError:
+            logger.error(f"request timed out: {url}")
+            raise
         except Exception as e:
             if str(e) == "出现验证码":  # "captcha detected", raised downstream
                 self.delete_proxy(_proxy["temp_proxy"])
@@ -186,6 +208,6 @@ class Info(ProxyMixin):
             if str(e) == "采集邮编错误":  # "scraped postcode is wrong"
                 self.cookie_error()
             logger.error(f"request failed: {e} - {url}")
             raise
         finally:
             self.join_proxy(_proxy["temp_proxy"])
@@ -14,8 +14,9 @@ from loguru import logger
 from lxml import etree

 from conf import config
-from const import Lang, StockStatus
+from const import Lang, Site, StockStatus
+from const import SiteType
 from db import RedisSingleton

 DOMAIN = config["app"]["domain"]
 COOKIE = config["cookie"]
@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]

 class Task:
-    def __init__(self, redis_singleton):
+    def __init__(self, redis_singleton: RedisSingleton):
         self.redis_singleton = redis_singleton

     def get_task(self, task_key: str = "", batch_size: int = 10):
@@ -120,20 +121,33 @@ class Task:
             redis_client.delete(time_key)
         return cookie

-    def get_loca_cookie(self, site: str = "com"):
+    def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
         """
         Fetch the locally cached cookie.
         :return:
         """
         redis_client = self.redis_singleton.get_connection()
-        cookie = redis_client.get(f"cookie:{site}")
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        cookie = redis_client.get(key)
+        if only_local:
+            return cookie
         if not cookie:
             cookie = self.get_cookie(site)
         if isinstance(cookie, dict):
             return cookie
         return json.loads(cookie)

+    def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
+        redis_client = self.redis_singleton.get_connection()
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        redis_client.set(key, json.dumps(data))

 class Request:
     @staticmethod
@@ -189,7 +203,7 @@ class Request:
         is_link_error = html.xpath('//div[@id="g"]/a/@href')
         title = Tool.get_title(html)
         if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
-            raise Exception(f"采集内容有误")
+            raise Exception("采集内容有误")  # "scraped content looks wrong"
         return text
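With the cookie cache now keyed by site plus optional postcode, the value captured by the API lands under a key like `cookie:com:20001`, `run()` uses `only_local=True` as a cheap existence check, and `InfoSpider.run()` reads the same key back through `self.postcode.value`. A small sketch of the round trip (Redis URL and cookie contents are illustrative; assumes the repo's `RedisSingleton`):

```python
from db import RedisSingleton  # repo module, as imported above
from tool import Task

task = Task(RedisSingleton(redis_url="redis://localhost:6379/0"))  # illustrative URL

# What the API stores after bootstrapping a cookie for postcode 20001:
task.set_loca_cookie(
    {"cookie": "session-id=...;ubid-main=...", "user_agent": "Mozilla/5.0 (...)"},
    site="com",
    postcode="20001",
)  # -> SET cookie:com:20001 '{"cookie": "...", "user_agent": "..."}'

# only_local=True returns the raw cached value (or None) without falling back
# to get_cookie(); this is run()'s existence check.
print(task.get_loca_cookie(site="com", postcode="20001", only_local=True))
```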