Commit 11e330a5 by yexing

u

parent 9fcc7b4f
__pycache__
.vscode
.*
celerybeat-*
.pytest_cache
log
pid
image
......
......@@ -21,6 +21,18 @@ class Data:
@classmethod
def inverse_dict(cls):
    """Return the inverted mapping of this class's items (value -> key)."""
    return {val: name for name, val in cls.items()}
class PropProxy:
    """Late-binding proxy for a class attribute.

    Holds a class and an attribute name; every read of ``value`` (and every
    ``str()`` conversion) performs a fresh ``getattr``, so the proxy always
    reflects the attribute's *current* value on the target class.
    """

    def __init__(self, ref: type, prop: str):
        self._target = ref
        self._attr = prop

    @property
    def value(self):
        # Deliberately not cached: later reassignments on the class are observed.
        return getattr(self._target, self._attr)

    def __str__(self):
        return str(self.value)
class Spider(Data):
......@@ -66,4 +78,8 @@ class StockStatus(Data):
class SiteType(Data):
    """Numeric ids for the supported site suffixes (com/de/it/fr/es/jp)."""

    com = 1
    de = 2
    it = 3
    fr = 4
    es = 5
    jp = 6
from __future__ import annotations
import json
import re
import os
import curl_cffi
from loguru import logger
from tenacity import retry, stop_after_attempt, wait_random
from lxml import etree
from const import Postcode, Site
from const import Postcode, PropProxy, Site
from db import RedisSingleton
from proxy import ProxyManager
from tool import Fmt, Request, Task
from conf import config
# Debug switch from the environment. NOTE(review): any non-empty string
# (including "0" or "false") is truthy here — confirm that is intended.
IS_DEBUG = os.environ.get("IS_DEBUG", False)
# Shared Redis connection wrapper, configured from conf.config.
REDIS = RedisSingleton(redis_url=config["redis"]["url"])
# Config section driving the detail-info task (keys, timeouts, flags).
task_info_config = config["task-info-detail"]
class Tool:
@staticmethod
def get_amazon_sku(text):
......@@ -95,14 +98,14 @@ class ProxyMixin:
:return:
"""
# if self.is_debug:
# test_proxy = "127.0.0.1:7890"
# proxy = "#1#2#127.0.0.1:7890"
# else:
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split("#")[0]
if self.is_debug:
test_proxy = "127.0.0.1:7890"
proxy = "#1#2#127.0.0.1:7890"
else:
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split("#")[0]
return {
"proxy": f"chensav:chensav@{test_proxy}",
"temp_proxy": proxy,
......@@ -134,11 +137,11 @@ class ProxyMixin:
redis.incr("amazon:cookie-error")
class Info(ProxyMixin):
class InfoSpider(ProxyMixin):
site = Site.com
postcode = Postcode.com
postcode = PropProxy(Postcode, site)
task_manager = Task(REDIS)
def __init__(self):
self.task_key = task_info_config["task_key"]
self.item_key = task_info_config["item_key"]
......@@ -146,39 +149,58 @@ class Info(ProxyMixin):
self.enabled = task_info_config["enabled"] == "True"
self.request_timeout = int(task_info_config["request_timeout"])
self.is_debug = task_info_config["is_debug"] == "True"
def format_content(self, text):
    """Parse an Amazon product-detail page and extract a few fields.

    :param text: raw HTML of the product page.
    :return: dict with ``free_delivery`` (parsed date or ``""``),
        ``product_dimensions`` and ``item_weight`` (``""`` when absent).
    """
    html = etree.HTML(text)
    free_delivery = html.xpath(
        '//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
    )
    ths: list[etree._Element] = html.xpath(
        '//*[@id="productDetails_detailBullets_sections1"]/tr/th'
    )
    tds: list[etree._Element] = html.xpath(
        '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
    )
    # Guard against cells whose .text is None (e.g. nested markup) so .strip()
    # cannot raise AttributeError; such cells map to "".
    detail = {
        (th.text or "").strip(): (td.text or "").strip() for th, td in zip(ths, tds)
    }
    free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
    return {
        "free_delivery": free_delivery,
        "product_dimensions": detail.get("Product Dimensions", ""),
        "item_weight": detail.get("Item Weight", ""),
    }
@retry(
    stop=stop_after_attempt(20),
    wait=wait_random(3, 6),
    # Swallow the final failure (return Ellipsis) instead of raising
    # RetryError to the caller once all attempts are exhausted.
    retry_error_callback=lambda *_: ...,
)
def run(self, task: dict):
    """Fetch and parse one product-detail task.

    :param task: task payload; ``task["url"]`` is the product URL whose ASIN
        is extracted and re-requested on the configured site.
    :return: parsed field dict from :meth:`format_content`.
    :raises ValueError: when no proxy is available.
    """
    asin = Tool.get_url_asin(task.get("url", ""))
    url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
    _proxy = self.get_proxy()
    if _proxy is None:
        raise ValueError("没有代理")
    try:
        headers = self.task_manager.get_loca_cookie(
            site=self.site, postcode=self.postcode.value
        )
        text = Request.request_html(
            url,
            _proxy["proxy"],
            headers=headers,
            timeout=self.request_timeout,
            postcode=self.postcode.value,
        )
        response = self.format_content(text)
        logger.debug(response)
        return response
    except curl_cffi.curl.CurlError:
        logger.error(f"请求超时: {url}")
        raise
    except Exception as e:
        # Known scraper-level failure messages raised downstream.
        if str(e) == "出现验证码":
            self.delete_proxy(_proxy["temp_proxy"])
        # NOTE(review): one unchanged line between these two checks fell
        # outside the diff hunks — verify against the full file.
        if str(e) == "采集邮编错误":
            self.cookie_error()
        logger.error(f"请求异常: {e} - {url}")
        raise
    finally:
        self.join_proxy(_proxy["temp_proxy"])
......@@ -14,8 +14,9 @@ from loguru import logger
from lxml import etree
from conf import config
from const import Lang, StockStatus
from const import Lang, Site, StockStatus
from const import SiteType
from db import RedisSingleton
# Values read straight from conf.config. NOTE(review): semantics assumed
# from the key names — confirm against the config file.
DOMAIN = config["app"]["domain"]
COOKIE = config["cookie"]
......@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]
class Task:
def __init__(self, redis_singleton):
def __init__(self, redis_singleton: RedisSingleton):
self.redis_singleton = redis_singleton
def get_task(self, task_key: str = "", batch_size: int = 10):
......@@ -120,19 +121,32 @@ class Task:
redis_client.delete(time_key)
return cookie
def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
    """Fetch the locally cached cookie for *site* (optionally per postcode).

    :param site: marketplace identifier, e.g. ``Site.com``.
    :param postcode: when given, the cache key becomes ``cookie:<site>:<postcode>``.
    :param only_local: when True, return the raw cached value (possibly None)
        without falling back to ``self.get_cookie``.
    :return: the cookie as a dict (deserialized from JSON when needed).
    """
    redis_client = self.redis_singleton.get_connection()
    key = f"cookie:{site}"
    if postcode:
        key += f":{postcode}"
    cookie = redis_client.get(key)
    if only_local:
        return cookie
    if not cookie:
        # Cache miss: acquire a fresh cookie for this site.
        cookie = self.get_cookie(site)
    if isinstance(cookie, dict):
        return cookie
    return json.loads(cookie)
def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
    """Serialize *data* to JSON and cache it under the site/postcode cookie key."""
    parts = [f"cookie:{site}"]
    if postcode:
        parts.append(f"{postcode}")
    conn = self.redis_singleton.get_connection()
    conn.set(":".join(parts), json.dumps(data))
class Request:
......@@ -189,7 +203,7 @@ class Request:
is_link_error = html.xpath('//div[@id="g"]/a/@href')
title = Tool.get_title(html)
if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
raise Exception(f"采集内容有误")
raise Exception("采集内容有误")
return text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment