Commit 11e330a5 by yexing

u

parent 9fcc7b4f
__pycache__ __pycache__
.vscode .*
celerybeat-* celerybeat-*
.pytest_cache
log log
pid pid
image image
......
...@@ -21,6 +21,18 @@ class Data: ...@@ -21,6 +21,18 @@ class Data:
@classmethod @classmethod
def inverse_dict(cls): def inverse_dict(cls):
return {v: k for k, v in cls.items()} return {v: k for k, v in cls.items()}
class PropProxy:
    """Late-binding proxy for an attribute of another object.

    Stores a reference and an attribute name; ``value`` performs the
    ``getattr`` on every access, so the proxy always reflects the
    attribute's *current* value rather than a snapshot taken at
    construction time.
    """

    def __init__(self, ref: type, prop: str):
        self._ref = ref    # object (typically a class) owning the attribute
        self._prop = prop  # attribute name, resolved lazily on each access

    @property
    def value(self):
        """Current value of the proxied attribute."""
        return getattr(self._ref, self._prop)

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        # Debug-friendly representation showing what the proxy points at.
        return f"{type(self).__name__}({self._ref!r}, {self._prop!r})"
class Spider(Data): class Spider(Data):
...@@ -66,4 +78,8 @@ class StockStatus(Data): ...@@ -66,4 +78,8 @@ class StockStatus(Data):
class SiteType(Data):
    # Numeric identifiers for the supported Amazon marketplaces,
    # keyed by top-level domain.
    com = 1
    de = 2
    it = 3
    fr = 4
    es = 5
    jp = 6
from __future__ import annotations
import json import json
import re import re
import os
import curl_cffi import curl_cffi
from loguru import logger from loguru import logger
from tenacity import retry, stop_after_attempt, wait_random from tenacity import retry, stop_after_attempt, wait_random
from lxml import etree from lxml import etree
from const import Postcode, Site from const import Postcode, PropProxy, Site
from db import RedisSingleton from db import RedisSingleton
from proxy import ProxyManager from proxy import ProxyManager
from tool import Fmt, Request, Task from tool import Fmt, Request, Task
from conf import config from conf import config
IS_DEBUG = os.environ.get("IS_DEBUG", False)
REDIS = RedisSingleton(redis_url=config["redis"]["url"]) REDIS = RedisSingleton(redis_url=config["redis"]["url"])
task_info_config = config["task-info-detail"] task_info_config = config["task-info-detail"]
class Tool: class Tool:
@staticmethod @staticmethod
def get_amazon_sku(text): def get_amazon_sku(text):
...@@ -95,14 +98,14 @@ class ProxyMixin: ...@@ -95,14 +98,14 @@ class ProxyMixin:
:return: :return:
""" """
# if self.is_debug: if self.is_debug:
# test_proxy = "127.0.0.1:7890" test_proxy = "127.0.0.1:7890"
# proxy = "#1#2#127.0.0.1:7890" proxy = "#1#2#127.0.0.1:7890"
# else: else:
proxy = self.proxy_manager.get_proxy() proxy = self.proxy_manager.get_proxy()
if proxy is None: if proxy is None:
return None return None
test_proxy = proxy.split("#")[0] test_proxy = proxy.split("#")[0]
return { return {
"proxy": f"chensav:chensav@{test_proxy}", "proxy": f"chensav:chensav@{test_proxy}",
"temp_proxy": proxy, "temp_proxy": proxy,
...@@ -134,11 +137,11 @@ class ProxyMixin: ...@@ -134,11 +137,11 @@ class ProxyMixin:
redis.incr("amazon:cookie-error") redis.incr("amazon:cookie-error")
class Info(ProxyMixin): class InfoSpider(ProxyMixin):
site = Site.com site = Site.com
postcode = Postcode.com postcode = PropProxy(Postcode, site)
task_manager = Task(REDIS) task_manager = Task(REDIS)
def __init__(self): def __init__(self):
self.task_key = task_info_config["task_key"] self.task_key = task_info_config["task_key"]
self.item_key = task_info_config["item_key"] self.item_key = task_info_config["item_key"]
...@@ -146,39 +149,58 @@ class Info(ProxyMixin): ...@@ -146,39 +149,58 @@ class Info(ProxyMixin):
self.enabled = task_info_config["enabled"] == "True" self.enabled = task_info_config["enabled"] == "True"
self.request_timeout = int(task_info_config["request_timeout"]) self.request_timeout = int(task_info_config["request_timeout"])
self.is_debug = task_info_config["is_debug"] == "True" self.is_debug = task_info_config["is_debug"] == "True"
def format_content(self, text): def format_content(self, text):
html = etree.HTML(text) html = etree.HTML(text)
free_delivery = html.xpath( free_delivery = html.xpath(
'//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()' '//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
) )
detail_bullets = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr') ths: list[etree._Element] = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]/tr/th'
)
tds: list[etree._Element] = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]/tr/td'
)
detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else "" free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
return { return {
"free_delivery": free_delivery, "free_delivery": free_delivery,
"product_dimensions": detail.get("Product Dimensions", ""),
"item_weight": detail.get("Item Weight", ""),
} }
@retry(
@retry(stop=stop_after_attempt(20), wait=wait_random(3, 6), retry_error_cls=lambda *_:...) stop=stop_after_attempt(20),
def run(self, task): wait=wait_random(3, 6),
retry_error_callback=lambda *_: ...,
)
def run(self, task: dict):
url = task.get("url", "") url = task.get("url", "")
asin = Tool.get_url_asin(url) asin = Tool.get_url_asin(url)
url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1" url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
_proxy = self.get_proxy() _proxy = self.get_proxy()
# if IS_DEBUG:
# _proxy = {"proxy": None}
if _proxy is None: if _proxy is None:
raise Exception("没有代理") raise ValueError("没有代理")
try: try:
headers = self.task_manager.get_loca_cookie(site=self.site) headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
text = Request.request_html( text = Request.request_html(
url, url,
_proxy["proxy"], _proxy["proxy"],
**{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode}, **{
"headers": headers,
"timeout": self.request_timeout,
"postcode": self.postcode.value,
},
) )
response = self.format_content(text) response = self.format_content(text)
logger.debug(response)
return response return response
except curl_cffi.curl.CurlError: except curl_cffi.curl.CurlError:
logger.error(f"请求超时: {url}") logger.error(f"请求超时: {url}")
raise
except Exception as e: except Exception as e:
if str(e) == "出现验证码": if str(e) == "出现验证码":
self.delete_proxy(_proxy["temp_proxy"]) self.delete_proxy(_proxy["temp_proxy"])
...@@ -186,6 +208,6 @@ class Info(ProxyMixin): ...@@ -186,6 +208,6 @@ class Info(ProxyMixin):
if str(e) == "采集邮编错误": if str(e) == "采集邮编错误":
self.cookie_error() self.cookie_error()
logger.error(f"请求异常: {e} - {url}") logger.error(f"请求异常: {e} - {url}")
raise
finally: finally:
self.join_proxy(_proxy["temp_proxy"]) self.join_proxy(_proxy["temp_proxy"])
...@@ -14,8 +14,9 @@ from loguru import logger ...@@ -14,8 +14,9 @@ from loguru import logger
from lxml import etree from lxml import etree
from conf import config from conf import config
from const import Lang, StockStatus from const import Lang, Site, StockStatus
from const import SiteType from const import SiteType
from db import RedisSingleton
DOMAIN = config["app"]["domain"] DOMAIN = config["app"]["domain"]
COOKIE = config["cookie"] COOKIE = config["cookie"]
...@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"] ...@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]
class Task: class Task:
    def __init__(self, redis_singleton: RedisSingleton):
        """Keep the Redis singleton; connections are obtained from it lazily."""
        self.redis_singleton = redis_singleton
def get_task(self, task_key: str = "", batch_size: int = 10): def get_task(self, task_key: str = "", batch_size: int = 10):
...@@ -120,19 +121,32 @@ class Task: ...@@ -120,19 +121,32 @@ class Task:
redis_client.delete(time_key) redis_client.delete(time_key)
return cookie return cookie
def get_loca_cookie(self, site: str = "com"): def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
""" """
获取本地cookie 获取本地cookie
:return: :return:
""" """
redis_client = self.redis_singleton.get_connection() redis_client = self.redis_singleton.get_connection()
cookie = redis_client.get(f"cookie:{site}") key = f"cookie:{site}"
if postcode:
key += f":{postcode}"
cookie = redis_client.get(key)
if only_local:
return cookie
if not cookie: if not cookie:
cookie = self.get_cookie(site) cookie = self.get_cookie(site)
if isinstance(cookie, dict): if isinstance(cookie, dict):
return cookie return cookie
return json.loads(cookie) return json.loads(cookie)
def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
redis_client = self.redis_singleton.get_connection()
key = f"cookie:{site}"
if postcode:
key += f":{postcode}"
redis_client.set(key, json.dumps(data))
class Request: class Request:
...@@ -189,7 +203,7 @@ class Request: ...@@ -189,7 +203,7 @@ class Request:
is_link_error = html.xpath('//div[@id="g"]/a/@href') is_link_error = html.xpath('//div[@id="g"]/a/@href')
title = Tool.get_title(html) title = Tool.get_title(html)
if len(is_link_error) == 0 and len(title) == 0 and is_product_detail: if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
raise Exception(f"采集内容有误") raise Exception("采集内容有误")
return text return text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment