Commit 58d033ad by yexing

up

parents
### celery 命令
- celery -A celery_app beat -l info
- celery -A celery_app worker -Q spider -l info -P gevent -E
- celery -A celery_app flower --port=5555 --basic_auth=username:password
- celery -A celery_app inspect active
- celery -A celery_app purge -Q spider -f
- celery -A celery_app control shutdown
### 其他
conda activate amazon-mult-site
* */12 * * * supervisorctl -c /etc/supervisord.conf restart celery-beat:00
from time import sleep
import asyncio
import json
import random
import re
import time
import traceback
from typing import Self
import requests
from DrissionPage import Chromium
from DrissionPage.common import Settings
from DrissionPage.items import WebPageTab
from loguru import logger
from redis.asyncio import from_url
class Data:
@classmethod
def items(cls):
return (
(k, v)
for k, v in cls.__dict__.items()
if isinstance(v, (str, int)) and not k.startswith("__")
)
@classmethod
def values(cls):
return (v for _, v in cls.items())
@classmethod
def zip(cls, other: Self):
return ((v, getattr(other, k)) for k, v in cls.items())
@classmethod
def inverse_dict(cls):
return {v: k for k, v in cls.items()}
class Postcode(Data):
com = "20001"
de = "55545"
it = "66040"
fr = "75000"
es = "04810"
jp = "496-0805"
class Site(Data):
com = "com"
# 德意法西日本
de = "de"
it = "it"
fr = "fr"
es = "es"
jp = "co.jp"
IS_DEBUG = False
DOMAIN = "https://20tools.net"
redis_config = {
"url": "redis://:a123456,a@localhost:6379/10",
"max_connections": 300
}
cookie_config = {
"cookie_time_key": "cookie_expired_time"
}
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
Settings.set_raise_when_wait_failed(False)
class RedisSingleton:
_redis_pool = None
def __init__(self, redis_url=None):
self.redis_url = redis_url
def get_connection(self):
if self._redis_pool is None:
if self.redis_url:
self._redis_pool = from_url(self.redis_url, decode_responses=True)
else:
# 默认连接地址
self._redis_pool = from_url('redis://localhost', decode_responses=True)
return self._redis_pool
class SiteType(Data):
com = 1
de = 2
def callback(param):
"""
回调接口
"""
requests.post(
f"{DOMAIN}/api/collection/cookie",
headers={
"Content-Type": "application/json",
"Accept": "application/json",
},
data=json.dumps(param),
)
def refresh_local_cookie(data: dict, site: str = "com"):
"""
刷新本地cookie
"""
redis = redis_singleton.get_connection()
redis.set(f"cookie:{site}", json.dumps(data))
def get_cookie_error():
"""
获取cookie错误
:return:
"""
redis = redis_singleton.get_connection()
return redis.get("amazon:cookie-error")
def delete_cookie_error():
"""
删除cookie错误
:return:
"""
redis = redis_singleton.get_connection()
return redis.delete("amazon:cookie-error")
def input_postcode(
tab: WebPageTab, postcode: str, locator: str = "#GLUXZipUpdateInput"
):
location_input = tab.ele(locator, timeout=3)
if location_input is None:
raise Exception("未找到输入框")
location_input.input(postcode)
sleep(1)
def get_cookie(tab: WebPageTab, site_type: int = 1):
"""
获取cookie
"""
cookie = tab.cookies().as_str()
content = tab.html
token = re.findall(r"name=\"stylesnap\" value=\"(.*?)\">", content)
response = {
"cookie": cookie,
"token": token[0] if token else "",
"user-agent": UA,
"time": int(time.time()),
}
logger.info(f"获取到cookie: {json.dumps(response)}")
callback({"type": site_type, "data": response})
return cookie
def run(site: str = "com", postcode: str = "20001", site_type: int = 1):
def _close():
cookie = get_cookie(tab, site_type)
if IS_DEBUG:
refresh_local_cookie({"cookie": cookie, "user-agent": UA}, site=site)
chromium.clear_cache()
chromium.quit()
delete_cookie_error()
if not IS_DEBUG:
number = get_cookie_error()
number = int(number) if number else 0
if number < 50:
logger.success("Cookie正常")
return
logger.error("Cookie异常")
chromium = Chromium()
tab = chromium.latest_tab
try:
# &currency=JPY
tab.get(f"https://www.amazon.{site}/stylesnap?language=en_GB")
# 判断邮编是否正确
line = tab.ele("#glow-ingress-line2", timeout=3)
nav = tab.ele("#icp-nav-flyout", timeout=3)
if line and nav:
postal, lang = line.text, nav.text
if (not postal or postcode not in postal) or (not lang or not "EN" not in lang):
logger.info("邮编或语言错误, 开始设置邮编和语言")
else:
logger.info("邮编和语言正确")
_close()
return
location = tab.ele("#nav-global-location-popover-link", timeout=3)
if not location:
raise Exception("没有进入正确页面")
else:
location.click()
postcode_parts = postcode.split("-")
if len(postcode_parts) == 2:
input_postcode(tab, postcode_parts[0], "#GLUXZipUpdateInput_0")
input_postcode(tab, postcode_parts[1], "#GLUXZipUpdateInput_1")
else:
input_postcode(tab, postcode)
locs = [
"#GLUXZipUpdate",
'xpath://*[@id="a-popover-1"]/div/header/button',
'xpath://*[@id="icp-nav-flyout"]/button',
"@text()=English",
]
for loc in locs:
ele = tab.ele(loc, timeout=3)
if not ele:
raise ValueError("元素定位错误")
ele.wait.clickable(timeout=3, raise_err=False).click()
tab.wait(2)
_close()
except Exception as e:
logger.error(e)
def main():
if IS_DEBUG:
items = random.choices(list(Site.zip(Postcode)))
else:
items = Site.zip(Postcode)
for site, postcode in items:
site_type = SiteType.__dict__.get(site)
if site_type is None:
continue
logger.info(f"开始获取cookie: {site} {postcode}")
run(site, postcode)
sleep(10)
if IS_DEBUG:
exit()
if __name__ == "__main__":
while True:
try:
redis_singleton = RedisSingleton(redis_url=redis_config["url"])
asyncio.run(main())
except:
traceback.print_exc()
# 依赖项:
# pip install fastapi uvicorn
import json
import uvicorn
from fastapi import FastAPI, Query, HTTPException
from conf import config
from db import RedisSingleton
app = FastAPI(title="Redis Query API", description="API to query data from Redis")
# 从配置文件获取Redis URL
redis_url = config.get('redis', 'url')
redis_client = RedisSingleton(redis_url)
@app.get("/redis/query/")
def query_redis(
query_key: str = Query(..., description="Redis key to query"),
count: int = Query(1, description="Number of items to retrieve", ge=1)
):
"""
从Redis中获取指定key的值
- **query_key**: Redis中的键
- **count**: 要获取的数量,默认为1
"""
try:
# 获取Redis连接
redis_conn = redis_client.get_connection()
# 如果是列表类型,获取指定数量的元素
values = redis_conn.lpop(query_key, count - 1)
# 将字符串转换为字典
new_values = [json.loads(value) for value in values]
return {"key": query_key, "type": "list", "values": new_values, "count": len(values)}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error querying Redis: {str(e)}")
if __name__ == "__main__":
# 启动FastAPI应用
uvicorn.run(app, host="0.0.0.0", port=9012)
import json
import os
import threading
import time
import redis
from curl_cffi import requests
from loguru import logger
from conf import config
COLL_DOMAIN = config['app']['coll_domain']
_redis_db = redis.Redis.from_url(config['redis']['url'], decode_responses=True)
task_monitoring_config = config['task-monitoring']
task_search_config = config['task-search']
task_product_detail_config = config['task-product-detail']
DEFAULT_HEADER = {
"Content-Type": "application/json",
"Accept": "application/json",
}
def task_callback(data):
"""
回调接口
"""
try:
url = f"{COLL_DOMAIN}/api/collection/task"
response = requests.post(url, headers=DEFAULT_HEADER, data=json.dumps(data), verify=False)
data = response.json()
if data["code"] == 0:
return True
else:
return False
except Exception as e:
logger.error(f"回调异常 : {e}")
return False
def batch_callback(callback_key: str):
thread = []
number = _redis_db.llen(callback_key)
logger.info(f"回调 {callback_key},共有{number}个任务")
for _ in range(10):
data = _redis_db.lpop(callback_key)
if data:
result = json.loads(data)
thread.append(threading.Thread(target=task_callback, args=(result,)))
else:
break
for t in thread:
t.start()
for t in thread:
t.join()
def callback_task(callback_key: str):
"""
回调任务
:param callback_key:
:return:
"""
task_number = 500
result = []
try:
if callback_key == task_monitoring_config.get('item_key'):
for _ in range(task_number):
data = _redis_db.lpop(callback_key)
if data:
result.append(json.loads(data))
else:
break
if result:
logger.info(f"回调 {callback_key},共有{len(result)}个任务")
logger.info(f"回调: result: {json.dumps(result)}")
callback = {
"data": {
"error_items": [],
"collection": result,
},
"type": 5,
}
task_callback(callback)
elif callback_key == task_product_detail_config.get('item_key') or callback_key == task_search_config.get('item_key'):
batch_callback(callback_key)
except:
logger.error(f"回调异常")
def run(task_config: dict = task_monitoring_config):
CALLBACK_PID_FILE = "./pid/callback.pid"
while True:
if not os.path.exists(CALLBACK_PID_FILE):
logger.error('任务退出')
break
try:
callback_key = task_config.get('item_key')
callback_task(callback_key)
logger.info(f"回调 {callback_key} 完成")
except Exception as e:
logger.error(f"任务异常 : {e}")
callback_sleep_time = int(task_config.get('callback_sleep_time', 5))
time.sleep(callback_sleep_time)
if __name__ == '__main__':
tasks = []
PID_FILES = [
"monitoring.pid",
"product_detail.pid",
"search.pid",
"callback.pid",
]
for PID_FILE in PID_FILES:
with open(f"./pid/{PID_FILE}", 'w') as f:
f.write(str(os.getpid()))
if task_monitoring_config.get('enabled', None) == 'True':
logger.info(f"采集任务回调启动")
t = threading.Thread(target=run, args=(task_monitoring_config,))
tasks.append(t)
if task_product_detail_config.get('enabled', None) == 'True':
logger.info(f"商品发布回调启动")
t = threading.Thread(target=run, args=(task_product_detail_config,))
tasks.append(t)
if task_search_config.get('enabled', None) == 'True':
logger.info(f"搜索回调启动")
t = threading.Thread(target=run, args=(task_search_config,))
tasks.append(t)
for t in tasks:
t.start()
for t in tasks:
t.join()
from datetime import timedelta
from celery import Celery
from kombu import Queue, Exchange
from conf import config
redis_url = config["redis"]["url"]
app = Celery(
"amazon",
broker=redis_url,
backend=redis_url,
include=["celery_tasks"],
)
app.conf.update(
task_serializer="json",
task_track_started=True,
task_create_missing_queues=True,
worker_concurrency=1,
worker_max_tasks_per_child=100,
worker_max_memory_per_child=200_000,
worker_prefetch_multiplier=2,
result_expires=300,
)
app.conf.broker_transport_options = {"global_keyprefix": "celery_broker"}
app.conf.result_backend_transport_options = {"prefix": "celery_backend"}
app.conf.timezone = "Asia/Shanghai"
app.conf.enable_utc = False
app.conf.singleton_lock_expiry = 600
app.conf.task_queues = (
Queue("spider", Exchange("spider"), routing_key="spider"),
Queue("dial", Exchange("dial"), routing_key="dial"),
)
app.conf.task_routes = {
"celery_tasks.detail_spider_task": {"queue": "detail"},
"celery_tasks.monitor_spider_task": {"queue": "detail"},
"celery_tasks.search_spider_task": {"queue": "search"},
"celery_tasks.*_dial_task": {"queue": "dial"},
"celery_tasks.*": {"queue": "detail"},
}
app.conf.beat_schedule = {
"start_spider_task": {
"task": "celery_tasks.start_spider_task",
"schedule": timedelta(seconds=5),
},
"start_dial_task": {
"task": "celery_tasks.start_dial_task",
"schedule": timedelta(seconds=5),
},
}
app.conf.ONCE = {
'backend': 'celery_once.backends.Redis',
'settings': {
'url': redis_url,
'default_timeout': 30
}
}
from __future__ import annotations
from time import sleep
import asyncio
import functools
import json
import os
from types import ModuleType
import tldextract
from celery import current_app, shared_task
from celery_singleton import Singleton
from celery_once import QueueOnce
from loguru import logger
from conf import config
from const import Site, Spider
from db import RedisSingleton
from dial import Dial
from proxy import ProxyManager
from spider import detail, monitor, search
from tool import Task
IS_DEBUG = int(os.environ.get("IS_DEBUG", False))
@shared_task(base=Singleton)
def start_spider_task(*keys: str):
for key in keys or Spider.values():
if config[key]["enabled"] != "True":
logger.debug(f"skip: {key}")
continue
assign_spider_task.delay(key)
@shared_task(base=Singleton)
def start_dial_task():
server = config["dial"]["server"]
server_list = json.loads(server)
for server in server_list:
check_dial_task.delay(server)
@shared_task(base=QueueOnce, once={'keys': ['server']})
def check_dial_task(server):
redis = RedisSingleton(redis_url=config["redis"]["url"])
proxyManage = ProxyManager(redis)
is_dial = config["dial"]["is_debug"] != "True"
dial = Dial(is_dial=is_dial, nameserver="223.5.5.5\nnameserver 119.29.29.29")
logger.debug(f"dial: {server}")
dial.check_dial(server, proxyManage)
@shared_task(bind=True)
def clean_proxy_task(self):
redis = RedisSingleton(redis_url=config["redis"]["url"])
proxy_manager = ProxyManager(redis)
proxy_manager.delete_all_proxy()
def base_spider(task: dict, site: str, module: ModuleType, class_name: str):
redis = RedisSingleton(redis_url=config["redis"]["url"])
cls = getattr(module, site.capitalize() + class_name)
if cls:
result = cls(redis).run(task)
logger.info(f"finish: {task.get('url')}")
return result
@shared_task(bind=True, autoretry_for=(Exception,), max_retries=10, retry_backoff=True)
def detail_spider_task(self, task: dict, site: str):
base_spider(task, site, detail, "Goods")
@shared_task(bind=True, autoretry_for=(Exception,), max_retries=10, retry_backoff=True)
def monitor_spider_task(self, task: dict, site: str):
base_spider(task, site, monitor, "Monitoring")
@shared_task(bind=True, autoretry_for=(Exception,), max_retries=10, retry_backoff=True)
def search_spider_task(self, task: dict, site: str):
base_spider(task, site, search, "Search")
@shared_task(bind=True, autoretry_for=(Exception,), max_retries=10, retry_backoff=True)
def assign_spider_task(self, key: str, tasks: list = None):
logger.debug(f"assign: {key}")
redis = RedisSingleton(redis_url=config["redis"]["url"])
proxy_manager = ProxyManager(redis)
proxy_number = proxy_manager.get_proxy_number()
logger.debug(f"代理数量: {proxy_number}")
if proxy_number < 8:
logger.warning("代理不足,任务终止")
sleep(1)
return
if tasks is None:
task_manager = Task(redis)
task_key = config[key]["task_key"]
task_number = int(config[key]["task_number"])
tasks = task_manager.get_task(task_key, task_number)
tasks = [json.loads(task) for task in tasks]
# tasks = [{"url": "https://www.amazon.de/"}] # Cause errors
inv = Site.inverse_dict()
for task in tasks:
url: str = task.get("url")
site = inv.get(tldextract.extract(url).suffix) if url else None
if not site:
logger.error(f"域名解析失败: {site}")
continue
if key == Spider.detail:
detail_spider_task.apply_async(
args=(task, site),
link_error=handle_failed_task.s(task_name=detail_spider_task.name),
)
elif key == Spider.monitor:
monitor_spider_task.apply_async(
args=(task, site),
link_error=handle_failed_task.s(task_name=monitor_spider_task.name),
)
elif key == Spider.search:
search_spider_task.apply_async(
args=(task, site),
link_error=handle_failed_task.s(task_name=search_spider_task.name),
)
@shared_task
def handle_failed_task(*args, **kwargs):
ctx, reason, _ = args
logger.info(f"task: {ctx.id}, reason: {reason}")
if not isinstance(reason, RuntimeError):
return
task_name = kwargs.get("task_name")
_trace = (ctx.headers or {}).get("_trace")
_trace = _trace + 1 if _trace else 1
logger.debug(f"trace: {_trace}")
current_app.tasks[task_name].apply_async(
args=ctx.args,
kwargs=ctx.kwargs,
countdown=30,
headers={"_trace": _trace},
link_error=handle_failed_task.s(task_name=task_name),
)
import configparser
import os
config = configparser.ConfigParser()
# 读取配置文件是否存在
config_path = os.path.join(os.path.dirname(__file__), 'config/config.ini')
if not config.read(config_path, encoding='utf-8'):
raise FileNotFoundError("配置文件不存在")
config.read('config.ini', encoding='utf-8')
# 校验配置文件是否正确
if 'redis' not in config:
raise ValueError("Redis 配置不存在")
else:
redis_config = config['redis']
if 'url' not in redis_config:
raise ValueError("Redis URL 不存在")
if 'max_connections' not in redis_config:
raise ValueError("Redis max_connections 不存在")
if 'app' not in config:
raise ValueError("App 配置不存在")
if 'task-monitoring' not in config:
raise ValueError("商品监控配置不存在")
if 'task-search' not in config:
raise ValueError("搜索配置不存在")
if 'cookie' not in config:
config['cookie'] = {
"cookie_time_key": "cookie_expired_time"
}
if 'impersonate' not in config['app']:
config['app']['impersonate'] = "chrome"
from typing import Self
class Data:
@classmethod
def items(cls):
return (
(k, v)
for k, v in cls.__dict__.items()
if isinstance(v, (str, int)) and not k.startswith("__")
)
@classmethod
def values(cls):
return (v for _, v in cls.items())
@classmethod
def zip(cls, other: Self):
return ((v, getattr(other, k)) for k, v in cls.items())
@classmethod
def inverse_dict(cls):
return {v: k for k, v in cls.items()}
class Spider(Data):
detail = "task-product-detail"
monitor = "task-monitoring"
search = "task-search"
class Site(Data):
com = "com"
# 德意法西日本
de = "de"
it = "it"
fr = "fr"
es = "es"
jp = "co.jp"
class Postcode(Data):
com = "20001"
de = "55545"
it = "66040"
fr = "75000"
es = "04810"
jp = "496-0805"
class Lang(Data):
com = "en_US"
de = "de_DE"
it = "it_IT"
fr = "fr_FR"
es = "es_ES"
jp = "ja_JP"
class StockStatus(Data):
com = "In Stock"
de = "Auf Lager"
it = "Disponibilità immediata"
fr = "En stock"
es = "En stock"
jp = "现在有货"
class SiteType(Data):
com = 1
de = 2
\ No newline at end of file
# import aioredis
from redis.asyncio import from_url
class RedisSingleton:
_redis_pool = None
def __init__(self, redis_url=None):
self.redis_url = redis_url
def get_connection(self):
if self._redis_pool is None:
if self.redis_url:
self._redis_pool = from_url(self.redis_url, decode_responses=True)
else:
# 默认连接地址
self._redis_pool = from_url('redis://localhost', decode_responses=True)
return self._redis_pool
from time import sleep
import asyncio
import json
import os
import socket
from multiprocessing import Pool
from fabric2 import Connection
from loguru import logger
from conf import config
from db import RedisSingleton
from proxy import ProxyManager
IS_DEBUG = int(os.environ.get("IS_DEBUG", False))
def get_server_info(item: str):
"""
:param item:
:return:
"""
info = item.split(":")
if len(info) < 4:
raise ValueError("拨号服务器配置错误")
return {
"name": info[0],
"host": info[1],
"port": int(info[2]),
"password": info[3],
"proxy_number": int(info[4]) if len(info) == 5 else 8,
}
def command_run(conn, command):
"""
run command
:param conn:
:param command:
:return:
"""
try:
item = conn.run(command, timeout=10, hide=True, warn=True)
if item.ok:
return item.stdout
return None
except socket.timeout:
return None
except Exception:
return None
class Dial:
_first_check = True
def __init__(self, **kwargs):
"""
server: server object [{name}:host:port:password:{代理数}]
:param kwargs:
"""
# self.server = kwargs.get('server', None)
# if self.server is None:
# raise ValueError("server is required")
self.nameserver = kwargs.get("nameserver", None)
self.dial_time = kwargs.get("dial_time", 5)
self.restart = kwargs.get("restart", True)
self.redis_url = kwargs.get("redis_url", "redis://localhost:6379/0")
self.loop_number = kwargs.get("loop_number", 600)
self.is_dial = kwargs.get("is_dial", False)
def dial(self, item, proxy_client):
try:
server_info = get_server_info(item)
dial_name = server_info["name"]
conn = Connection(
host=f"{server_info['host']}",
user="root",
port=server_info["port"],
connect_kwargs={"password": server_info["password"]},
)
except Exception as e:
logger.error(f"服务器连接异常: {item} : {e}")
return
for _ in range(10):
try:
command_list = []
if self.is_dial:
logger.info(f"重新拨号: {dial_name}")
command_list.append("pppoe-stop")
command_list.append("pppoe-start")
if self.nameserver:
command_list.append(
f'echo -e "nameserver {self.nameserver}" | tee /etc/resolv.conf > /dev/null && chmod 644 /etc/resolv.conf && cat /etc/resolv.conf'
)
if self.restart:
command_list.append("tp start;tp restart")
for command in command_list:
command_run(conn, command)
logger.success(f"{dial_name}: 拨号成功")
proxy = None
for _ in range(10):
sleep(self.dial_time)
proxy = command_run(conn, "curl ifconfig.me")
if proxy:
break
if proxy:
logger.info(f"{dial_name} - 代理: {proxy}")
proxy_client.add_proxy(
name=server_info["name"],
proxy=f"{proxy}:3328",
proxy_number=server_info["proxy_number"],
)
break
except Exception as e:
logger.error(f"{dial_name}: 拨号失败 - {e}")
continue
conn.close()
def check_dial(self, item, proxy_client: ProxyManager):
"""
校验拨号
:param item:
:param proxy_client:
:return:
"""
server_info = get_server_info(item)
dial_name = server_info["name"]
proxy_number = server_info["proxy_number"]
sem = asyncio.Semaphore(proxy_number)
ttl_time = proxy_client.check_proxy_expire(dial_name)
if ttl_time and proxy_client.get_proxy_number() >= 8:
return
try:
proxy_client.delete_proxy(dial_name)
with sem:
self.dial(item, proxy_client)
except Exception as e:
logger.error(f"拨号失败: {item}, {e}")
def run(self, item, proxy_client):
"""
:param item:
:param proxy_client:
:return:
"""
try:
loop = asyncio.get_event_loop()
loop.run_until_complete(self.check_dial(item, proxy_client))
except KeyboardInterrupt:
logger.error(f"任务终止")
except Exception as e:
logger.error(f"任务终止: {e}")
if __name__ == "__main__":
server = config["dial"]["server"]
server_list = json.loads(server)
redis_config = config["redis"]
pool = Pool(processes=len(server_list))
redis_singleton = RedisSingleton(redis_url=redis_config["url"])
proxyManage = ProxyManager(redis_singleton)
results = []
# dial = Dial(server=server_list, is_dial=True, nameserver='119.29.29.29\nnameserver 223.5.5.5')
dial = Dial(
server=server_list,
is_dial=False,
nameserver="223.5.5.5\nnameserver 119.29.29.29",
)
for server in server_list:
result = pool.apply_async(dial.run, args=(server, proxyManage))
results.append(result)
for i in results:
i.wait()
import asyncio
import socket
import paramiko
from fabric2 import Connection
from loguru import logger
def get_server_info(item: str):
"""
:param item:
:return:
"""
info = item.split(":")
if len(info) < 4:
raise ValueError("拨号服务器配置错误")
return {
"name": info[0],
"host": info[1],
"port": int(info[2]),
"password": info[3],
"proxy_number": int(info[4]) if len(info) == 5 else 8,
}
def command_run(conn, command):
"""
run command
:param conn:
:param command:
:return:
"""
try:
item = conn.run(command, timeout=10, hide=True, warn=True)
if item.ok:
return item.stdout
return None
except socket.timeout:
return None
except Exception:
return None
def run(item):
server_info = get_server_info(item)
dial_name = server_info["name"]
logger.info(f"开始拨号: {dial_name}")
try:
conn = Connection(
host=f"{server_info['host']}",
user="root",
port=server_info["port"],
connect_kwargs={"password": server_info["password"]},
connect_timeout=10,
)
command_run(conn, "pppoe-start")
proxy = command_run(conn, "curl ifconfig.me")
if proxy:
logger.info(f"{dial_name} {proxy}")
else:
logger.error(f"{dial_name} None")
conn.close()
except paramiko.SSHException:
logger.error(f"服务器连接异常: {dial_name}")
if __name__ == "__main__":
loop = asyncio.get_event_loop()
dials = ["qcs64rk21:58.241.175.244:20125:WyyOF214982d:8","qcs64rk22:58.241.175.245:20371:WyyOF214982d:8","qcs64rk23:58.241.175.245:20315:WyyOF214982d:8","qcs64rk24:58.241.175.245:20063:WyyOF214982d:8","qcs64rk25:58.241.175.243:20015:WyyOF214982d:8","qcs64rk26:58.241.175.243:20335:WyyOF214982d:8","qcs64rk27:58.241.175.245:20221:WyyOF214982d:8","qcs64rk28:58.241.175.244:20299:WyyOF214982d:8","qcs64rk29:58.241.175.243:20259:WyyOF214982d:8","qcs64rk210:58.241.175.245:20151:WyyOF214982d:8","qcs64rk211:58.241.175.243:20173:WyyOF214982d:8","qcs64rk212:58.241.175.243:20149:WyyOF214982d:8","qcs64rk213:58.241.175.243:20157:WyyOF214982d:8","qcs64rk214:58.241.175.245:20207:WyyOF214982d:8","qcs64rk215:58.241.175.243:20061:WyyOF214982d:8","ahvxr8n4:jmdx2.leyuyun.com:20025:nBMf28x17M32:8","x9cg9u8p5:61.175.187.148:20283:1IZce6p3A524:8","x9cg9u8p4:61.175.187.148:20331:1IZce6p3A524:8","x9cg9u8p3:61.175.187.148:20177:1IZce6p3A524:8","x9cg9u8p2:61.175.187.148:20429:1IZce6p3A524:8","x9cg9u8p1:61.175.187.148:20189:1IZce6p3A524:8"]
for item in dials:
asyncio.run(run(item))
import logging
class ProxyManager:
def __init__(self, redis_singleton, **kwargs):
self.proxy_set_key = kwargs.get('proxy_set_key', 'dial-proxies')
self.proxy_expire_key = kwargs.get('proxy_expire_key', 'dial-proxies-expire')
self.proxy_key = kwargs.get('proxy_key', 'dial-proxies')
self.redis_singleton = redis_singleton
def add_proxy(self, name: str = None, proxy: str = None, during: int = 360000, proxy_number: int = 8):
"""
添加代理
:param name:
:param proxy:
:param during:
:param proxy_number:
:return:
"""
# 将代理添加到集合中
redis_client = self.redis_singleton.get_connection()
proxy_key = f"{self.proxy_expire_key}:{name}"
redis_client.set(proxy_key, 1, ex=during)
proxy_key = f"{self.proxy_key}:{name}"
redis_client.set(proxy_key, proxy)
for i in range(proxy_number):
self.join_proxy("#".join([proxy, f"{name}#{i}"]), is_first=True)
def delete_proxy(self, name):
"""
删除代理
:param name:
:return:
"""
proxy = self.get_use_proxy(name)
redis_client = self.redis_singleton.get_connection()
if proxy is not None:
for i in range(50):
redis_client.srem(self.proxy_set_key, "#".join([proxy, f"{name}#{i}"]))
redis_client.delete(f"{self.proxy_expire_key}:{name}")
redis_client.delete(f"{self.proxy_key}:{name}")
def delete_all_proxy(self):
"""
删除所有代理
:return:
"""
redis_client = self.redis_singleton.get_connection()
keys = redis_client.scan_iter(f"{self.proxy_expire_key}*")
for key in keys:
redis_client.delete(key)
keys = redis_client.scan_iter(f"{self.proxy_key}*")
for key in keys:
redis_client.delete(key)
redis_client.delete(self.proxy_set_key)
def get_use_proxy(self, name):
"""
获取使用的代理
:param name:
:return:
"""
redis_client = self.redis_singleton.get_connection()
proxy_key = f"{self.proxy_key}:{name}"
proxy = redis_client.get(proxy_key)
return proxy
def check_proxy_expire(self, name):
"""
检查代理是否过期
:param name:
:return:
"""
# 判断代理是否过期
redis_client = self.redis_singleton.get_connection()
proxy_key = f"{self.proxy_expire_key}:{name}"
ttl = redis_client.ttl(proxy_key)
return ttl > 0
def get_proxy(self):
"""
获取代理
:return:
"""
redis_client = self.redis_singleton.get_connection()
proxy = redis_client.spop(self.proxy_set_key)
return proxy if proxy else None
def get_proxy_number(self):
"""
获取代理数量
:return:
"""
redis_client = self.redis_singleton.get_connection()
number = redis_client.scard(self.proxy_set_key)
return number
def join_proxy(self, proxy: str = None, is_first: bool = False):
"""
将代理加入到代理集合中
:param proxy:
:param is_first:
:return:
"""
name = proxy.split('#')[1]
host = proxy.split('#')[0]
redis_client = self.redis_singleton.get_connection()
old_proxy = self.get_use_proxy(name)
if (old_proxy and old_proxy == host) or is_first:
redis_client.sadd(self.proxy_set_key, proxy)
else:
logging.error(f"代理已经被切换, 不加入: {proxy}")
[pytest]
addopts = -vs -p no:warnings
testpaths = ./test
python_files = test_*.py
python_classes = Test*
python_functions = test*
\ No newline at end of file
asgiref==3.8.1
Babel==2.17.0
celery==5.5.2
celery_singleton==0.3.1
curl_cffi==0.10.0
DrissionPage==4.1.0.18
DrissionPage==4.1.0.18
fabric2==3.2.2
fastapi==0.115.12
loguru==0.7.3
lxml==5.4.0
pytest==8.3.5
python_dateutil==2.9.0.post0
redis==6.1.0
Requests==2.32.3
tldextract==5.3.0
uvicorn==0.34.2
flower
from const import Postcode, Site, Lang, StockStatus
class Adapter(type):
def __new__(cls, name, bases, attrs):
code = name[:2].lower() # 类名格式 XX<Type>
attrs.update({
"site": getattr(Site, code),
"postcode": getattr(Postcode, code),
"lang": getattr(Lang, code),
"stock_status": getattr(StockStatus, code)
})
return super().__new__(cls, name, bases, attrs)
from time import sleep
import asyncio
import html
import json
import random
import re
import sys
import time
import traceback
import curl_cffi
import unicodedata
from loguru import logger
from lxml import etree
from conf import config
from const import Lang, Postcode, Site, StockStatus
from db import RedisSingleton
from proxy import ProxyManager
from tool import Fmt, Task, Request
redis_config = config["redis"]
task_monitoring_config = config["task-product-detail"]
redis_singleton = RedisSingleton(redis_url=config["redis"]["url"])
class Tool:
@staticmethod
def get_amazon_sku(text):
"""
获取amazon_sku
:param text:
:return:
"""
amazon_skus = re.findall(r"dimensionValuesDisplayData\" : ({[\s\S]*?}),", text)
if len(amazon_skus) == 0:
amazon_skus = re.findall(r"dimensionValuesData\": ({[\s\S]*?}),", text)
if len(amazon_skus):
amazon_skus = json.loads(amazon_skus[0])
return amazon_skus
@staticmethod
def get_url_asin(url: str):
patterns = ["dp/(.+?)\?", "dp/(.+?)\/", "dp/(.+)"]
asin = ""
for pattern in patterns:
asin = re.findall(pattern, url)
if len(asin):
asin = asin[0]
if len(asin) > 10:
asin = asin.split("/")[0]
break
return asin
@staticmethod
def get_book_asin(url: str, asin: str):
"""
获取图书的asin
:param url:
:param asin:
:return:
"""
if url == "javascript:void(0)":
return asin
asin = re.findall(r"/dp/(\w+)", url)
if not asin:
return ""
return asin[0]
@staticmethod
def get_title(html):
"""
获取标题
:param html:
:return:
"""
title = html.xpath('//span[@id="productTitle"]/text()')
if len(title) == 0:
title = html.xpath('//span[@id="bond-title-desktop"]/text()')
return title
@staticmethod
def get_data_json(text):
"""
获取data_json
:param text:
:return:
"""
data_json = re.findall("jQuery.parseJSON\('(.*)'\)", text)
data_json = data_json[0].replace("\\'", "'")
return json.loads(data_json)
class Cache:
@staticmethod
def get_cache(key: str):
"""
获取缓存
:param key:
:return:
"""
redis = redis_singleton.get_connection()
return redis.get(key)
@staticmethod
def set_cache(key: str, value: str, expire: int = 60):
"""
设置缓存
:param key:
:param value:
:param expire:
:return:
"""
redis = redis_singleton.get_connection()
redis.set(key, value, ex=expire)
# Useless
def format_result(result: dict, task: dict):
"""
格式化结果
:param result:
:param task:
:return:
"""
admin_users_id = task.get("admin_users_id")
item_id = task.get("item_id")
app_name = task.get("app_name", "admin")
free_delivery = Fmt.parse_date(result["free_delivery"])
return {
"price": result["price"],
"parent_asin": result.get("parent_asin", ""),
"category_name": result["category_name"],
"category_names": result["category_names"],
"ship_from": result["ship_from"],
"sold_by": result["sold_by"],
"status": result["status"],
"free_delivery": free_delivery,
"delivery_info": result["delivery_info"],
"is_link_error": result["is_link_error"],
"sku_name": result["sku_name"],
"admin_users_id": admin_users_id,
"item_id": item_id,
"asin": result["asin"],
"is_unavailable": result["is_unavailable"],
"star_level": result["star_level"],
"app_name": app_name,
"step_value": task.get("step_value"),
}
def get_brand(html):
"""
获取品牌
:param html:
:return:
"""
brand = html.xpath('//tr[@class="a-spacing-small po-brand"]/td[2]/span/text()')
if len(brand) == 0:
try:
brand = html.xpath('//a[@id="bylineInfo"]/text()')[0]
if brand.find("Visit the") != -1:
pattern = r"Visit the (.+?) Store"
match = re.search(pattern, brand)
brand = [match.group(1)] if match else ""
elif brand.find("Brand: ") != -1:
brand = [brand.replace("Brand: ", "")]
else:
brand = ""
except:
pass
# 从商品详情中获取品牌
if not brand:
product_details = html.xpath(
'//table[@id="productDetails_detailBullets_sections1"]//tr/th/text()'
)
if len(product_details) == 0:
brand_number = 0
for item in product_details:
brand_number += 1
if item.find("Brand") != -1:
break
brand = html.xpath(
f'//table[@id="productDetails_detailBullets_sections1"]//tr[{brand_number}]//td/text()'
)
if brand:
brand = [brand[0].strip()]
# 从店铺中获取品牌
if not brand:
brand = html.xpath('//a[@id="bylineInfo"]/@href')
if len(brand):
brand = [brand[0].split("/")[2]]
else:
brand = ""
return brand
def get_dimensions_display(text):
"""
获取商品规格
:param text:
:return:
"""
dimensions_display = re.findall(r"dimensionsDisplay\" : (\[[\s\S]*?\]),", text)
if len(dimensions_display) == 0:
dimensions_display = re.findall(
r"variationDisplayLabels\" : ({[\s\S]*?})", text
)
if len(dimensions_display) > 0:
dimensions_display = json.loads(dimensions_display[0])
dimensions_display = [value for name, value in dimensions_display.items()]
else:
dimensions_display = json.loads(dimensions_display[0])
return dimensions_display
def get_sku_name(skus, asin, dimensions_display):
"""
获取sku名称
:param skus:
:param asin:
:param dimensions_display:
:return:
"""
new_list = {}
for key, value in skus.items():
for i, v in enumerate(value):
if dimensions_display[i] not in new_list:
new_list[dimensions_display[i]] = []
new_list[dimensions_display[i]].append(v)
new_dimensions_display = []
for key, value in new_list.items():
new_list[key] = len(list(set(value)))
# 按照 值的大小排序
new_list = sorted(new_list.items(), key=lambda x: x[1], reverse=True)
for i in new_list:
new_dimensions_display.append(i[0])
sku = skus.get(asin, "")
if not sku:
return ""
if dimensions_display != new_dimensions_display:
# 重新组合 sku
new_sku = []
for i in new_dimensions_display:
for index, value in enumerate(dimensions_display):
if i == value:
new_sku.append(sku[index])
sku = new_sku
return sku
def get_description(html):
"""
获取商品详情
:param html:
:return:
"""
xpath_list = {
"product_description": '//div[@id="aplus_feature_div"]//p[not(ancestor::table)] | //div[@id="aplus_feature_div"]//span[@class="a-text-bold"][not(ancestor::table)] | //div[@id="aplus_feature_div"]//img[not(ancestor::table)] | //div[@id="aplus_feature_div"]//h2[not(ancestor::table)] | //div[@id="aplus_feature_div"]//h1[not(ancestor::table)] | //div[@id="aplus_feature_div"]//table[@class="apm-tablemodule-table"] | //div[@id="productDescription_feature_div"]//span | //div[@id="productDescription_feature_div"]//h2',
"form_brand": '//div[@id="aplusBrandStory_feature_div"]//p | //div[@id="aplusBrandStory_feature_div"]//div[@class="apm-brand-story-background-image"]//img | //div[@id="aplusBrandStory_feature_div"]//h2',
}
result_html = ""
for elements_xpath in xpath_list:
elements = html.xpath(xpath_list[elements_xpath])
# 循环顺序打印 features 里面的所有值和图片,用 p 标签 区分
tables_data = []
for element in elements:
if element.tag == "p":
result_html += f"<p>{element.text}</p>"
elif element.tag == "img" and element.get("src"):
if element.get("src").lower().endswith(".gif"):
continue
result_html += f"<img src='{element.get('src')}' />"
elif element.tag == "h2":
result_html += f"<h2>{element.text}</h2>"
elif element.tag == "span":
result_html += f"<p>{element.text}</p>"
elif element.tag == "h1":
result_html += f"<h1>{element.text}</h1>"
return result_html
def get_book_description(html):
"""
:param html:
:return:
"""
html = html.xpath('//div[@id="bookDescription_feature_div"]/div/div')[0]
xpath_list = {
"description": './/span[@class="a-text-italic"] | .//span[@class="a-text-bold"] | .//span'
}
# 转为字符串
html_str = etree.tostring(html, encoding="utf-8").decode("utf-8")
html_str = html_str.replace("\n", "").replace("\t", "").replace("<br/>", "")
html = etree.HTML(html_str)
result = []
result_html = ""
for elements_xpath in xpath_list:
elements = html.xpath(xpath_list[elements_xpath])
# 循环顺序打印 features 里面的所有值和图片,用 p 标签 区分
for element in elements:
if element.tag == "span":
if not element.text:
continue
if element.text == "Read more":
continue
result.append(element.text)
elif element.tag == "h1":
result.append(element.text)
return result
def get_color_images(text):
"""
获取color_images
:param text:
:return:
"""
color_images = re.findall(r"'colorImages': ({[\s\S]*?}),[\n\s]+'", text)
data = color_images[0].replace("'", '"')
return json.loads(data)
def format_content(text: str = "", lang = Lang.com, stock_status = StockStatus.com):
"""
获取亚马逊商品详情
:param text:
:return:
"""
html = etree.HTML(text)
data_json = {}
try:
data_json = Tool.get_data_json(text)
except:
pass
title = Tool.get_title(html)
is_unavailable = html.xpath(
'//div[@class="a-section a-spacing-small a-text-center"]//span[@class="a-color-price a-text-bold"]/text()'
)
price = html.xpath('//input[@id="twister-plus-price-data-price"]/@value')
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none"]/span/text()'
)
if not ship_from:
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none "]/span/text()'
)
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none"]/span//text()'
)
if not sold_by:
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none "]/span//text()'
)
if not ship_from:
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none "]/span/text()'
)
if not ship_from:
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none odf-truncation-popover"]/span/text()'
)
if not sold_by:
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none odf-truncation-popover"]/span//text()'
)
status = html.xpath(
'//div[@id="availabilityInsideBuyBox_feature_div"]/div/div/span/text()'
)
free_delivery = html.xpath(
'//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
)
delivery_info = html.xpath(
'//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/text()'
)
star_level = html.xpath('//span[@id="acrPopover"]/@title')
brand = get_brand(html)
is_link_error = html.xpath('//div[@id="g"]/a/@href')
is_buy_now = html.xpath('//input[@id="buy-now-button"]/@name')
sku_image = html.xpath('//div[@id="imgTagWrapperId"]/img/@src')
if not sku_image:
sku_image = html.xpath('//div[@id="unrolledImgNo0"]//img/@src')
dimensions_display = get_dimensions_display(text)
asin = html.xpath('//input[@id="ASIN"]/@value')
title = title[0].strip() if len(title) else ""
# 先判断是否图书
paper_back = html.xpath('//div[@id="tmm-grid-swatch-PAPERBACK"]//a/@href')
hardcover = html.xpath('//div[@id="tmm-grid-swatch-HARDCOVER"]//a/@href')
is_book = False
if len(paper_back) or len(hardcover):
logger.info(f"图书")
skus = {}
if len(paper_back):
paper_back_asin = Tool.get_book_asin(paper_back[0], asin[0])
skus[paper_back_asin] = ["Paperback"]
if len(hardcover):
hardcover_asin = Tool.get_book_asin(hardcover[0], asin[0])
skus[hardcover_asin] = ["Hardcover"]
sku_name = skus.get(asin[0], "default")
is_book = True
title = title + " - " + sku_name[0]
sold_by = ["book"]
point_description = get_book_description(html)
descriptions = ""
else:
skus = re.findall(r"dimensionValuesDisplayData\" : ({[\s\S]*?}),", text)
sku_name = "default"
if len(skus) == 0:
skus = re.findall(r"dimensionValuesData\": ({[\s\S]*?}),", text)
if len(skus):
skus = json.loads(skus[0])
sku_name = get_sku_name(skus, asin[0], dimensions_display)
descriptions = get_description(html)
point_description = html.xpath('//div[@id="feature-bullets"]//ul//li//text()')
if len(point_description) == 0:
point_description = html.xpath(
'//ul[@class="a-unordered-list a-vertical a-spacing-small"]/span/li/span/text()'
)
images = []
max_image_number = 6
try:
color_images = get_color_images(text)
for item in color_images:
for image in color_images[item]:
if len(images) > max_image_number:
break
hi_res = image.get("hiRes")
if not hi_res:
hi_res = image.get("large")
if hi_res and hi_res not in images:
images.append(hi_res)
except:
logger.error("获取color_images失败")
data_json_color_images = data_json.get("colorImages", {})
for name, items in data_json_color_images.items():
for item in items:
if len(images) > max_image_number:
break
hi_res = item.get("hiRes", "")
if hi_res and hi_res not in images:
images.append(hi_res)
# 去除里面的SX355
sku_image = sku_image[0] if len(sku_image) else ""
image_size = sku_image.split(".")[-2]
sku_image = sku_image.replace(f"{image_size}", "_AC_SX1500_")
star_level = star_level[0] if len(star_level) else ""
star_level = star_level.replace(" out of 5 stars", "")
images = images[0:max_image_number]
category_names = html.xpath(
'//li/span[@class="a-list-item"]/a[@class="a-link-normal a-color-tertiary"]/text()'
)
category_name = ""
if category_names:
category_name = (
">>".join(category_names).strip().replace("\n", "").replace(" ", "")
)
parent_asin = re.findall(r'"parentAsin":"(.*?)"', text)
# 商品信息
product_information = {}
try:
product_info_table = html.xpath('//*[@id="productDetails_techSpec_section_1"]')
if not product_info_table:
product_info_table = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]'
)
if product_info_table:
rows = product_info_table[0].xpath(".//tr")
for row in rows:
th = row.xpath("./th/text()")
td = row.xpath("./td//text()")
if th and td:
key = th[0].strip()
value = td[0].strip()
product_information[key] = value
# 商品详情
detailBullets_feature_ul = html.xpath('//*[@id="detailBullets_feature_div"]/ul')
if detailBullets_feature_ul:
rows = detailBullets_feature_ul[0].xpath(".//li")
for row in rows:
li_parent_span = row.xpath("./span")
if li_parent_span:
spans = li_parent_span[0].xpath(".//span")
if spans:
key = spans[0].text.strip()
key = re.sub(r"[\u200e\u200f\u202c\u00a0]", "", key)
key = re.sub(r"\s+", " ", key).strip()
value = spans[1].text.strip()
value = re.sub(r"[\u200e\u200f\u202c\u00a0]", "", value)
value = re.sub(r"\s+", " ", value).strip()
product_information[key] = value
# 重要信息
important_information = html.xpath('//*[@id="important-information"]')
if important_information:
divs = important_information[0].xpath('.//div[@class="a-section content"]')
if divs:
for div in divs:
key = ""
key_options = [
div.xpath(".//h4/text()"),
div.xpath(".//span/text()"),
]
for option in key_options:
if option:
key = option
break
key = key[0].strip()
value = div.xpath(".//p/text()")[0].strip()
product_information[key] = value
# 页面sku模块的属性值
glance_icons_div = html.xpath('//*[@id="glance_icons_div"]')
if glance_icons_div:
glance_icons_td = glance_icons_div[0].xpath(".//td")
if glance_icons_td:
for td in glance_icons_td:
child_td = td.xpath(".//td")
if child_td:
real_td = child_td[1]
spans = real_td.xpath(".//span")
if spans:
key = spans[0].text.strip()
value = "".join(spans[1].itertext()).strip()
product_information[key] = value
# 页面sku模块的商品详情
product_facts_desktop_expander = html.xpath(
'//*[@id="productFactsDesktopExpander"]'
)
if product_facts_desktop_expander:
product_detail_exist = False
h3 = product_facts_desktop_expander[0].xpath(".//h3/text()")
if h3:
for _h3 in h3:
if "Product details" in _h3:
product_detail_exist = True
break
if product_detail_exist:
ul = product_facts_desktop_expander[0].xpath(".//ul")
if ul:
li = ul[0].xpath(".//li")
if li:
for _li in li:
div = _li.xpath('.//div[@class="a-fixed-left-grid-inner"]')
if div:
child_div = div[0].xpath(".//div")
if child_div and len(child_div) == 2:
key = (
child_div[0]
.xpath(".//span/span/text()")[0]
.strip()
)
value = (
child_div[1]
.xpath(".//span/span/text()")[0]
.strip()
)
product_information[key] = value
except Exception as e:
result_asin = asin[0] if len(asin) else ""
logger.error(
f"获取商品信息失败: {e}, asin: {result_asin}\n{traceback.format_exc()}"
)
free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
status = Fmt.parse_status(status[0]) if len(status) else ""
return {
"title": title,
"images": images,
"point_description": point_description,
"descriptions": descriptions,
"skus": skus,
"is_unavailable": is_unavailable[0] if len(is_unavailable) else "",
"price": price[0] if len(price) else "",
"ship_from": ship_from[0] if len(ship_from) else "",
"sold_by": sold_by[0] if len(sold_by) else "",
"status": status,
"free_delivery": free_delivery,
"delivery_info": delivery_info[0] if len(delivery_info) else "",
"brand": brand[0].strip() if len(brand) else "",
"is_link_error": is_link_error[0] if len(is_link_error) else "",
"is_buy_now": is_buy_now[0] if len(is_buy_now) else "",
"sku_image": sku_image,
"asin": asin[0] if len(asin) else "",
"sku_name": sku_name,
"star_level": star_level,
"category_name": category_name,
"parent_asin": parent_asin[0] if len(parent_asin) else "",
"is_book": is_book,
"product_information": product_information,
}
def get_save_skus_param(response):
"""
获取保存sku参数
:param response:
:return:
"""
return {
"price": response["price"],
"ship_from": response["ship_from"],
"sold_by": response["sold_by"],
"status": response["status"],
"free_delivery": response["free_delivery"],
"delivery_info": response["delivery_info"],
"is_link_error": response["is_link_error"],
"is_buy_now": response["is_buy_now"],
"sku_image": response["sku_image"],
"sku_name": response["sku_name"],
"asin": response["asin"],
"is_unavailable": response["is_unavailable"],
"star_level": response["star_level"],
"title": response.get("title", ""),
"description": response.get("descriptions", ""),
"images": response.get("images", []),
"point_description": response.get("point_description", ""),
"parent_asin": response.get("parent_asin", ""),
"product_information": response.get("product_information", {}),
}
def replace_chinese(_text):
"""
去除字符串里的中文内容
:param _text:
:return:
"""
return re.sub("[\u4e00-\u9fa5]", "", _text)
def desensitization(_text):
"""
脱敏处理
:param _text:
:return:
"""
return (
_text.replace("100%", "")
.replace("amazon", "")
.replace("Amazon", "")
.replace("AMAZON", "")
)
def replace_case(_text, sub_str, replace_str):
"""
替换指定字符串,不区分大小写
:param _text:
:param sub_str:
:param replace_str:
:return:
"""
compileObj = re.compile(re.escape(sub_str), re.IGNORECASE)
resultantStr = compileObj.sub(replace_str, _text)
return resultantStr
def remove_diacritics(input_str):
"""
去除字符串中的变音符
:param input_str:
:return:
"""
nfkd_form = unicodedata.normalize("NFKD", input_str)
return "".join([char for char in nfkd_form if not unicodedata.combining(char)])
class Goods:
site = Site.com
postcode = Postcode.com
lang = Lang.com
stock_status = StockStatus.com
def __init__(self, singleton):
self.task_key = task_monitoring_config["task_key"]
self.item_key = task_monitoring_config["item_key"]
self.task_number = int(task_monitoring_config["task_number"])
self.enabled = task_monitoring_config["enabled"] == "True"
self.request_timeout = int(task_monitoring_config["request_timeout"])
self.is_debug = task_monitoring_config["is_debug"] == "True"
self.proxy_manager = ProxyManager(singleton)
self.task_manager = Task(singleton)
self.redis_singleton = singleton
def get_proxy(self):
"""
:return:
"""
# if self.is_debug:
# test_proxy = "127.0.0.1:7890"
# proxy = "#1#2#127.0.0.1:7890"
# else:
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split("#")[0]
return {
"proxy": f"chensav:chensav@{test_proxy}",
"temp_proxy": proxy,
}
def join_proxy(self, proxy):
"""
加入代理
:param proxy:
:return:
"""
return self.proxy_manager.join_proxy(proxy)
def delete_proxy(self, proxy):
"""
删除代理
:param proxy:
:return:
"""
proxy_name = proxy.split("#")[1]
return self.proxy_manager.delete_proxy(proxy_name)
def cookie_error(self):
"""
cookie错误
:return:
"""
redis = redis_singleton.get_connection()
redis.incr("amazon:cookie-error")
def callback(self):
pass
def content(self, url: str):
"""
获取商品详情
:param url:
:return:
"""
response = None
for _ in range(20):
asin = Tool.get_url_asin(url)
cache_key = f"cache:amazon:goods:{asin}"
cache_time = 60 * 30
response = None
if not self.is_debug:
# 判断是否存在缓存
cache_data = Cache.get_cache(cache_key)
if cache_data:
response = json.loads(cache_data)
break
url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
_proxy = self.get_proxy()
if _proxy is None:
logger.error("等待代理")
number = random.randint(3, 6)
sleep(number)
continue
try:
headers = self.task_manager.get_loca_cookie(site=self.site)
text = Request.request_html(
url,
_proxy["proxy"],
**{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode},
)
self.join_proxy(_proxy["temp_proxy"])
response = format_content(text, self.lang, self.stock_status)
Cache.set_cache(cache_key, json.dumps(response), cache_time)
break
except curl_cffi.curl.CurlError:
logger.error(f"请求超时: {url}")
except Exception as e:
if str(e) == "出现验证码":
self.delete_proxy(_proxy["temp_proxy"])
if str(e) == "采集邮编错误":
self.cookie_error()
logger.error(f"请求异常: {e} - {url}")
self.join_proxy(_proxy["temp_proxy"])
return response
def run(self, task):
"""
运行任务
:param task:
:return:
"""
url = task.get("url", "")
logger.info(f"采集任务: {url}")
max_sku_number = task.get("max_sku_number", 15)
response = self.content(url)
if not response:
logger.error(f"采集失败: {url}")
return
admin_users_id = task["admin_users_id"]
task_id = task["id"]
asin_list = []
result_sku = []
brand = list()
category_name = response.get("category_name", "")
sku_name = response.get("sku_name", "")
default_brand = ""
max_sku_number = 50 if max_sku_number > 50 else max_sku_number
max_sku_number = 25 if max_sku_number < 25 else max_sku_number
is_book = response.get("is_book", False)
if response["skus"]:
if len(response["skus"]) > int(max_sku_number):
logger.info(f"SKU数: {len(response['skus'])}, 跳过采集")
return
collection_skus = []
default_asin = response["asin"]
if len(response["skus"]) > 0:
for asin, sku in response["skus"].items():
if asin == default_asin:
if response["brand"]:
brand.append(response["brand"])
save_response = get_save_skus_param(response)
asin_list.append(save_response)
continue
if len(asin) > 12:
continue
url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
collection_skus.append(self.content(url))
# 分批
if len(collection_skus) > 0:
for i in range(0, len(collection_skus), 8):
for response in asyncio.gather(
*collection_skus[i : i + 8]
):
try:
if response.get("brand"):
brand.append(response["brand"])
if response["category_name"] != "Back to results":
category_name = response["category_name"]
save_response = get_save_skus_param(response)
asin_list.append(save_response)
except Exception as e:
traceback.print_exc()
logger.error(f"采集商品详情失败: {url} : {e}")
else:
brand.append(response["brand"])
response = get_save_skus_param(response)
asin_list.append(response)
if len(brand):
default_brand = brand[0]
for goods_sku in asin_list:
images = goods_sku["images"]
images = images[0:6]
title = goods_sku["title"]
title = html.unescape(title)
# 去除标题的100%字样
title = desensitization(title)
new_point_description = ""
point_description = goods_sku["point_description"]
descriptions = goods_sku["description"]
if len(point_description):
for item in point_description:
item = replace_chinese(item)
item = html.unescape(item)
item = desensitization(item)
new_point_description += f"<li>{item}</li>"
if default_brand:
brand_list = default_brand.split(" ")
replace_brands = list()
replace_brands.append(default_brand)
for brand_ in brand_list:
replace_brands.append(brand_)
for replace_brand in replace_brands:
descriptions = replace_case(descriptions, replace_brand, "")
new_point_description = replace_case(
new_point_description, replace_brand, ""
)
title = replace_case(title, replace_brand, "")
# 替换标题中的品牌
diacritics_brand = remove_diacritics(replace_brand)
descriptions = replace_case(
descriptions, diacritics_brand, ""
)
new_point_description = replace_case(
new_point_description, diacritics_brand, ""
)
title = replace_case(title, diacritics_brand, "")
goods_sku["title"] = title
goods_sku["point_description"] = new_point_description
goods_sku["description"] = descriptions
goods_sku["images"] = images
goods_sku["brand"] = default_brand
result_sku.append(goods_sku)
# logger.info(f"采集类目:{category_name}")
if category_name == "Back to results":
logger.error(f"采集分类有问题 - {url}")
# 如果是图书
if is_book:
result = []
for item in result_sku:
result.append(
{
"brand": default_brand,
"category_name": category_name,
"skus": [item],
"sku_name": item["sku_name"],
"is_book": is_book,
}
)
else:
result = [
{
"brand": default_brand,
"category_name": category_name,
"skus": result_sku,
"sku_name": sku_name,
"is_book": is_book,
}
]
for result_item in result:
data = {
"data": {
"task_id": task_id,
"collection_data": result_item,
"log_id": task["log_id"],
"platform_type": task.get("platform_type", 1),
"export_type": task.get("export_type", 1),
"country": self.lang[-2:]
},
"type": 4,
"admin_users_id": admin_users_id,
"app_name": task.get("app_name", "admin"),
"project": task.get("project", "tiktok"),
}
if self.is_debug:
logger.success(f"采集结果: {json.dumps(data, ensure_ascii=False)}")
logger.success(f"回调任务ID: {task_id}")
self.task_manager.callback_task(self.item_key, data)
return result
def main(self):
"""
:return:
"""
if not self.is_debug:
proxy_number = self.proxy_manager.get_proxy_number()
if proxy_number <= 10:
logger.info("代理数量不足")
sleep(3)
return
if not self.enabled:
logger.info("发布任务未开启")
sleep(3)
sys.exit(0)
if self.is_debug:
tasks = [
'{"id": 252, "admin_users_id": 1, "log_id": 11, "platform_type": 1, "callback_type": 9, "app_name": "admin", "max_sku_number": 35, "project": "temu", "url": "https://www.amazon.com/dp/B0F4PHMSLN?th=1"}',
]
else:
tasks = self.task_manager.get_task(self.task_key, self.task_number)
if not tasks:
logger.info("任务队列为空")
sleep(5)
return
queue = []
start_time = time.time()
tasks = [json.loads(task) for task in tasks]
for task in tasks:
queue.append(self.run(task))
if queue:
asyncio.gather(*queue)
logger.info(f"任务耗时: {time.time() - start_time}")
if self.is_debug:
sys.exit(0)
from time import sleep
import asyncio
import json
import re
import time
import traceback
import curl_cffi
from loguru import logger
from lxml import etree
from conf import config
from const import Lang, Postcode, Site, StockStatus
from db import RedisSingleton
from proxy import ProxyManager
from tool import Fmt, Task, Request
redis_config = config['redis']
task_monitoring_config = config['task-monitoring']
redis_singleton = RedisSingleton(redis_url=config["redis"]["url"])
class Tool:
@staticmethod
def get_amazon_sku(text):
"""
获取amazon_sku
:param text:
:return:
"""
amazon_skus = re.findall(r"dimensionValuesDisplayData\" : ({[\s\S]*?}),", text)
if len(amazon_skus) == 0:
amazon_skus = re.findall(r"dimensionValuesData\": ({[\s\S]*?}),", text)
if len(amazon_skus):
amazon_skus = json.loads(amazon_skus[0])
return amazon_skus
@staticmethod
def get_url_asin(url: str):
patterns = [
'dp/(.+?)\?',
'dp/(.+?)\/',
'dp/(.+)'
]
asin = ""
for pattern in patterns:
asin = re.findall(pattern, url)
if len(asin):
asin = asin[0]
if len(asin) > 10:
asin = asin.split("/")[0]
break
return asin
@staticmethod
def get_data_json(text):
"""
获取data_json
:param text:
:return:
"""
data_json = re.findall('jQuery.parseJSON\(\'(.*)\'\)', text)
data_json = data_json[0].replace("\\'", "'")
return json.loads(data_json)
class Cache:
@staticmethod
def get_cache(key: str):
"""
获取缓存
:param key:
:return:
"""
redis = redis_singleton.get_connection()
return redis.get(key)
@staticmethod
def set_cache(key: str, value: str, expire: int = 60):
"""
设置缓存
:param key:
:param value:
:param expire:
:return:
"""
redis = redis_singleton.get_connection()
redis.set(key, value, ex=expire)
def format_result(result: dict, task: dict, lang = Lang.com, stock_status = StockStatus.com):
"""
格式化结果
:param result:
:param task:
:return:
"""
if not result:
return {}
admin_users_id = task.get('admin_users_id')
item_id = task.get('item_id')
app_name = task.get('app_name', 'admin')
title = result.get('title', '')
if title:
title = title.strip()
free_delivery = Fmt.parse_date(result.get('free_delivery'))
status = Fmt.parse_status(result.get('status'))
return {
"price": result['price'],
"parent_asin": result.get('parent_asin', ''),
'category_name': result['category_name'],
'category_names': result['category_names'],
"ship_from": result['ship_from'],
"sold_by": result['sold_by'],
"status": status,
"free_delivery": free_delivery,
"delivery_info": result['delivery_info'],
"is_link_error": result['is_link_error'],
"sku_name": result['sku_name'],
"admin_users_id": admin_users_id,
"item_id": item_id,
"asin": result['asin'],
"is_unavailable": result['is_unavailable'],
"star_level": result['star_level'],
'app_name': app_name,
'step_value': task.get('step_value'),
'title': title,
"country": lang[-2:]
}
def get_title(html):
"""
获取标题
:param html:
:return:
"""
title = html.xpath('//span[@id="productTitle"]/text()')
if len(title) == 0:
title = html.xpath('//span[@id="bond-title-desktop"]/text()')
return title
def format_content(text: str = "", is_sku: bool = True):
"""
获取商品详情
:param is_sku:
:param text:
:return:
"""
html = etree.HTML(text)
is_link_error = html.xpath('//div[@id="g"]/a/@href')
skus = []
sku_name = 'default'
amazon_skus = []
# 判断url是否被删除
if len(is_link_error) == 0:
amazon_skus = Tool.get_amazon_sku(text)
data_json = Tool.get_data_json(text)
if is_sku:
color_to_asin = data_json['colorToAsin']
for item in color_to_asin:
skus.append(color_to_asin[item]['asin'])
if len(skus) == 0:
for item in amazon_skus:
skus.append(item)
is_unavailable = html.xpath(
'//div[@class="a-section a-spacing-small a-text-center"]//span[@class="a-color-price a-text-bold"]/text()')
price = html.xpath('//input[@id="twister-plus-price-data-price"]/@value')
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none"]/span/text()')
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none"]/span//text()')
if not sold_by:
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none "]/span//text()')
if not ship_from:
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none "]/span/text()')
if not ship_from:
ship_from = html.xpath(
'//div[@id="fulfillerInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none odf-truncation-popover"]/span/text()')
if not sold_by:
sold_by = html.xpath(
'//div[@id="merchantInfoFeature_feature_div"]/div/div[@class="offer-display-feature-text a-spacing-none odf-truncation-popover"]/span//text()')
status = html.xpath('//div[@id="availabilityInsideBuyBox_feature_div"]/div/div/span/text()')
free_delivery = html.xpath(
'//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()')
delivery_info = html.xpath('//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/text()')
brand = html.xpath('//tr[@class="a-spacing-small po-brand"]//td[2]/span/text()')
star_level = html.xpath('//span[@id="acrPopover"]/@title')
star_level = star_level[0] if len(star_level) else ''
star_level = star_level.replace(' out of 5 stars', '')
asin = html.xpath('//input[@id="ASIN"]/@value')
is_buy_now = html.xpath('//input[@id="buy-now-button"]/@name')
category_names = html.xpath('//li/span[@class="a-list-item"]/a[@class="a-link-normal a-color-tertiary"]/text()')
if not is_buy_now:
is_buy_now = html.xpath('//input[@id="add-to-cart-button"]/@name')
if not is_buy_now:
is_unavailable = ['Out of stock']
category_name = ""
if category_names:
category_name = category_names[-1].strip()
category_names = ">>".join(category_names).strip().replace("\n", "").replace(" ", "")
parent_asin = re.findall(r'"parentAsin":"(.*?)"', text)
title = get_title(html)
# 判断是否是Sku采集
upload_data = {
"price": price,
"parent_asin": parent_asin,
"ship_from": ship_from,
"sold_by": sold_by,
"status": status,
"free_delivery": free_delivery,
"delivery_info": delivery_info,
"brand": brand,
"is_unavailable": is_unavailable,
"is_link_error": is_link_error,
"asin": asin,
"title": title,
}
for key, item in upload_data.items():
try:
upload_data[key] = upload_data[key][0] if len(upload_data[key]) else None
except Exception:
upload_data[key] = None
upload_data['skus'] = skus
upload_data['star_level'] = star_level
upload_data['category_name'] = category_name
upload_data['category_names'] = category_names
paper_back = html.xpath('//div[@id="tmm-grid-swatch-PAPERBACK"]//a/@href')
hardcover = html.xpath('//div[@id="tmm-grid-swatch-HARDCOVER"]//a/@href')
if len(paper_back) or len(hardcover):
if len(paper_back) and paper_back[0] == 'javascript:void(0)':
sku_name = ['Paperback']
if len(hardcover) and hardcover[0] == 'javascript:void(0)':
sku_name = ['Hardcover']
upload_data['sold_by'] = 'book'
else:
if len(amazon_skus):
sku_name = amazon_skus.get(upload_data['asin'], "default")
upload_data['sku_name'] = sku_name
return upload_data
class Monitoring:
site = Site.com
postcode = Postcode.com
lang = Lang.com
stock_status = StockStatus.com
def __init__(self, singleton):
self.task_key = task_monitoring_config['task_key']
self.item_key = task_monitoring_config['item_key']
self.task_number = int(task_monitoring_config['task_number'])
self.enabled = task_monitoring_config['enabled'] == 'True'
self.request_timeout = int(task_monitoring_config['request_timeout'])
self.proxy_manager = ProxyManager(singleton)
self.task_manager = Task(singleton)
self.redis_singleton = singleton
self.is_debug = task_monitoring_config['is_debug'] == 'True'
def get_proxy(self):
"""
:return:
"""
# if self.is_debug:
# test_proxy = '127.0.0.1:7890'
# proxy = '#1#2#127.0.0.1:7890'
# else:
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split('#')[0]
return {
"proxy": f"chensav:chensav@{test_proxy}",
"temp_proxy": proxy,
}
def join_proxy(self, proxy):
"""
加入代理
:param proxy:
:return:
"""
return self.proxy_manager.join_proxy(proxy)
def delete_proxy(self, proxy):
"""
删除代理
:param proxy:
:return:
"""
proxy_name = proxy.split('#')[1]
return self.proxy_manager.delete_proxy(proxy_name)
def run(self, task):
"""
获取商品详情
:param task:
:return:
"""
step_value = task.get("step_value", None)
if step_value is None:
task['step_value'] = time.strftime('%m%d%H%M')
task['is_first'] = True
url = task.get('url', '')
asin = Tool.get_url_asin(url)
# 判断是否存在缓存
cache_key = f"cache:amazon:search:{asin}"
cache_data = Cache.get_cache(cache_key)
cache_data = None
is_first = True if task.get('is_first', True) else False
cache_time = int(task.get('cache_time', (60 * 60 * 2)))
if cache_data and is_first is False:
response = json.loads(cache_data)
else:
url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
response = {}
_proxy = self.get_proxy()
if _proxy is None:
logger.error('代理获取失败,重新入队')
self.task_manager.add_task(task, self.task_key)
return
try:
headers = self.task_manager.get_loca_cookie(self.site)
text = Request.request_html(url, _proxy['proxy'],
**{"headers": headers, "timeout": self.request_timeout,
"is_product_detail": True, "postcode": self.postcode})
response = format_content(text, is_first)
if not response.get('status', None) and not response.get('price', None) and not response.get(
'is_unavailable', None) and not response.get('title', None):
logger.error(f"采集商品价格有误: {url}")
raise curl_cffi.curl.CurlError('请求超时')
if is_first is False:
Cache.set_cache(cache_key, json.dumps(response), cache_time)
except curl_cffi.curl.CurlError:
logger.error(f"请求超时: {url}")
self.task_manager.add_task(task, self.task_key)
except Exception as e:
if str(e) == '出现验证码':
self.delete_proxy(_proxy['temp_proxy'])
logger.error(f"请求异常: {e}")
self.task_manager.add_task(task, self.task_key)
self.join_proxy(_proxy['temp_proxy'])
if response:
skus = response.get('skus', [])
for sku in skus:
if sku == asin:
continue
temp = {
'url': f"https://www.amazon.{self.site}/dp/" + sku + "?th=1&psc=1",
'is_first': False,
}
self.task_manager.add_task({**task, **temp}, self.task_key)
try:
if not response:
return
task = task or {}
result = format_result(response, task, self.lang, self.stock_status)
if self.is_debug:
logger.success(f'回调内容:{result}')
self.task_manager.callback_task(self.item_key, result)
return result
except Exception as e:
logger.error(f"回调异常: {e}")
logger.debug(traceback.format_exc())
self.task_manager.add_task(task, self.task_key)
def main(self):
if not self.is_debug:
proxy_number = self.proxy_manager.get_proxy_number()
if proxy_number <= 1:
logger.info('代理数量不足')
sleep(3)
return
if not self.enabled:
logger.info('监控任务未开启')
return
if self.is_debug:
tasks = [
'{"url":"https://www.amazon.com/dp/B0F4PHMSLN","item_id":"27","admin_users_id":1,"callback_type":5,"collection_type":1,"app_name":"admin","cache_time":60}']
else:
tasks = self.task_manager.get_task(self.task_key, self.task_number)
if not tasks:
logger.info('任务队列为空')
sleep(5)
return
queue = []
start_time = time.time()
tasks = [json.loads(task) for task in tasks]
for task in tasks:
queue.append(self.run(task))
success_number = 0
logger.info(f"任务数: {len(queue)}")
if queue:
for items in asyncio.gather(*queue):
success_number += 1
logger.info(f"任务耗时: {time.time() - start_time}, 成功数: {success_number}")
from time import sleep
import asyncio
import functools
import json
import os
import re
import sys
import time
import urllib.parse
import uuid
import requests as _requests
from curl_cffi import requests
from loguru import logger
from lxml import etree
from conf import config
from const import Lang, Postcode, Site
from db import RedisSingleton
from proxy import ProxyManager
from tool import Fmt, Task, Request, Proxy
redis_config = config['redis']
task_config = config['task-search']
redis_singleton = RedisSingleton(redis_url=config["redis"]["url"])
IS_DEBUG = int(os.environ.get("IS_DEBUG", False))
TASK_TYPE_AMAZON = 1
def format_best_seller(text):
"""
格式化畅销榜数据
:param text:
:return:
"""
html = etree.HTML(text)
search_list = html.xpath('//div[@id="gridItemRoot"]//div[@class="zg-grid-general-faceout"]')
gathered = []
result = []
for item in search_list:
title = item.xpath('.//span/div/text()')
image = item.xpath('.//img/@src')
price = item.xpath('.//span/span/text()')
new_delivery = [
"FREE delivery",
"Tomorrow",
"on $35 of items shipped by Amazon",
"Or fastest delivery",
"Tomorrow"
]
asin = item.xpath('./div/@id')
review_count = item.xpath('.//a//span[@class="a-size-small"]/text()')
star_level = item.xpath('.//a/i/span/text()')
star_level = star_level[0] if len(star_level) else ""
star_level = star_level.replace(' out of 5 stars', '')
asin = (asin[0] if len(asin) else "")
result.append({
"title": (title[0] if len(title) else ""),
"price": (Fmt.parse_price(price[0]) if len(price) else ""),
"delivery": new_delivery,
"review_count": (review_count[0] if len(review_count) else ""),
"unique_value": asin,
"image": (image[0] if len(image) else ""),
"is_prime": "",
"star_level": star_level,
})
gathered.append(asin)
return result, gathered
def amazon_best_sellers(text, site: str = "com"):
"""
亚马逊畅销榜
:param text:
:return:
"""
tok = re.findall('data-acp-params="(.+?)"', text)
html = etree.HTML(text)
recs_list = html.xpath('//div[@data-a-card-type="basic"]/div/@data-client-recs-list')
if len(recs_list) == 0:
raise Exception("畅销榜没有数据")
asins = json.loads(recs_list[0])
result, gathered = format_best_seller(text)
# 不存在的获取
ids = []
for asin in asins:
if asin['id'] in gathered:
continue
ids.append(json.dumps(asin))
if len(ids):
for i in range(20):
logger.info(f"获取畅销榜第{i + 1}次")
url = f"https://www.amazon.{site}/acp/p13n-zg-list-grid-desktop/p13n-zg-list-grid-desktop-a39b75ff-7640-424e-be75-273399fcd19b-1718097743426/nextPage"
payload = {
"faceoutkataname": "GeneralFaceout",
"ids": ids,
"indexes": [
],
"linkparameters": "",
"offset": "1",
"reftagprefix": "zg_bs_g_appliances"
}
headers = {
'accept': 'text/html, application/json',
'origin': f'https://www.amazon.{site}',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
'x-amz-acp-params': tok[0],
'x-requested-with': 'XMLHttpRequest',
'content-type': 'application/json',
'cookie': 'session-id=133-9000729-4069415; i18n-prefs=USD; ubid-main=133-9231024-1932862; skin=noskin; lc-main=en_US; kndctr_7742037254C95E840A4C98A6_AdobeOrg_identity=CiYzMTIzODU1NzQ3OTgxOTQ4MjU4NDIyODIzNDE4NzE5ODkzNTc5NVIRCNzDmo7sMRgBKgRTR1AzMAHwAdzDmo7sMQ==; AMCV_7742037254C95E840A4C98A6%40AdobeOrg=MCMID|31238557479819482584228234187198935795; session-id-time=2082787201l; ld=AZUSSOA-yaflyout; s_pers=%20s_fid%3D6E30972CB9C22290-3EAA27AEFFDCE1E4%7C1881886732593%3B%20s_dl%3D1%7C1724122132593%3B%20gpv_page%3DUS%253AAZ%253ASOA-overview-sell%7C1724122132593%3B%20s_ev15%3D%255B%255B%2527AZUSSOA-yaflyout%2527%252C%25271724120332596%2527%255D%255D%7C1881886732596%3B; s_sess=%20c_m%3DAZUSSOA-yaflyoutundefinedAmazon.comundefined%3B%20s_cc%3Dtrue%3B%20s_ppvl%3DUS%25253AAZ%25253ASOA-overview-sell%252C24%252C11%252C2589%252C2133%252C1145%252C1920%252C1200%252C0.9%252CL%3B%20s_ppv%3DUS%25253AAZ%25253ASOA-overview-sell%252C24%252C11%252C2589%252C2133%252C1145%252C1920%252C1200%252C0.9%252CL%3B; session-token=eZ27y8iWCeBux4Xx7wDTfY46zzxkepHUYlsJeldb1+O7QsWoggMMC3kiGx5u1P1N3hju/iONQFabxEHbLNRU6ooYRT7rSPFhZwKxv9FWTEwkYmlxWBxB5FNXYkdmrLftH6ijWddfZGzJ3ZCzKhPBSvBVs89Us2p7DFMVuivOsKqj+gCubq8QKz7OQXGyiiooXQ3gn3L8NnuCQuBRWYJcZUzTol/DHCWkpCsOjBc6CWJm5wbig+UVXXypf7AxWrJOK2ZoPvRs8DZRTuR7xvp1zPRYEYP9mi+XtAaoIraf9bcrFXrfTuUc4yeJBUSBLO+bkRK7YFk8L2QOIGBdcsCQ0JP5bd/7QgUZ; csm-hit=tb:N41G6HHV91WK66N7APDF+s-N41G6HHV91WK66N7APDF|1724909616666&t:1724909616666&adb:adblk_no',
}
try:
ip = Proxy.get_zhan_proxies()
proxy = f"http://{ip}"
proxies = {
"https": proxy,
"http": proxy,
}
response = requests.request("POST", url, headers=headers, data=json.dumps(payload), proxies=proxies, timeout=20)
new_result, _ = format_best_seller(response.text)
result = result + new_result
break
except Exception as e:
logger.error(f"获取畅销榜失败: {e}")
return result
def get_url_asin(url: str):
patterns = [
'dp/(.+?)\?',
'dp/(.+?)\/',
'dp/(.+)'
]
asin = ""
for pattern in patterns:
asin = re.findall(pattern, url)
if len(asin):
asin = asin[0]
if len(asin) > 10:
asin = asin.split("/")[0]
break
return asin
def amazon_detail(asin, proxy, site: str = "com", **kwargs):
"""
亚马逊详情页
:param asin:
:param proxy:
:param kwargs:
:return:
"""
w_names = {
'sp_detail_thematic1': {
"url": f'https://www.amazon.{site}/sspa/paginate?widgetLocale=en_US&themeSelector=highly_rated&showHelpfulSentence=false&ASIN=[asin]&wName=sp_detail_thematic&widgetGroup=desktop-dp-sims&isMultiPlacementRequest=true&doNotShowProductAttributes=false&oData=[oData]',
},
'sp_detail_thematic2': {
"url": f"https://www.amazon.{site}/sspa/paginate?themeSelector=top_brands&ASIN=[asin]&wName=sp_detail_thematic&widgetGroup=desktop-dp-sims&isMultiPlacementRequest=true&doNotShowProductAttributes=false&oData=[oData]"
},
}
result = []
unique_asins = []
for name, value in w_names.items():
o_data = ""
for i in range(1, 10):
try:
url = value['url'].replace("[asin]", asin).replace("[oData]", o_data)
logger.info(f"详情页: {name} 获取第{i}页, {url}")
detail = Request.request_html(url, proxy, **kwargs)
detail = json.loads(detail)
if len(detail['data']) == 0:
logger.error(f"相似商品没有数据")
break
for da in detail['data']:
o_data += da['oid']
content = da['content']
etree_html = etree.HTML(content)
detail = etree_html.xpath("//div[@class='a-section sp_offerVertical sp_ltr_offer']/@data-adfeedbackdetails")
if len(detail) == 0:
continue
new_delivery = [
"FREE delivery",
"Tomorrow",
"on $35 of items shipped by Amazon",
"Or fastest delivery",
"Tomorrow"
]
detail = json.loads(detail[0])
if detail['asin'] in unique_asins:
continue
unique_asins.append(detail['asin'])
result.append({
"title": detail['title'],
"price": detail['priceAmount'],
"delivery": new_delivery,
"review_count": 1000,
"unique_value": detail['asin'],
"image": detail['adCreativeImage']['highResolutionImages'][0]['url'],
"is_prime": "",
"star_level": 4.1,
})
except Exception as e:
logger.error(f"获取相似商品: {e}")
return result
def get_url_query_page(url: str):
"""
获取url的query参数
:param url:
:return:
"""
temp_url = urllib.parse.urlparse(url)
temp_url = urllib.parse.parse_qs(temp_url.query)
if 'page' not in temp_url:
return 1
return temp_url['page'][0]
def amazon_search(text):
"""
亚马逊搜索
:param text:
:return:
"""
html = etree.HTML(text)
search_list = html.xpath(
'//div[@class="s-main-slot s-result-list s-search-results sg-row"]//div[@data-component-type="s-search-result"]')
result = []
for item in search_list:
title = item.xpath('.//h2[@class="a-size-mini a-spacing-none a-color-base s-line-clamp-2"]//span/text()')
if len(title) == 0:
title = item.xpath('.//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
if len(title) == 0:
title = item.xpath('.//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
if len(title) == 0:
title = item.xpath('.//h2[@class="a-size-base-plus a-spacing-none a-color-base a-text-normal"]//span/text()')
if len(title) == 0:
title = item.xpath('.//h2[@class="a-size-medium a-spacing-none a-color-base a-text-normal"]/span/text()')
price = item.xpath('.//span[@class="a-offscreen"]/text()')
delivery = item.xpath('.//div[@data-cy="delivery-recipe"]//div[@class="a-column a-span12"]//text()')
if not delivery:
delivery = item.xpath('.//div[@data-cy="delivery-recipe"]//div[@class="a-row a-size-base a-color-secondary s-align-children-center"]//text()')
review_count = item.xpath('.//span[@class="a-icon-alt"]/text()')
asin = item.xpath('./@data-asin')
image = item.xpath('.//img[@class="s-image s-image-optimized-rendering"]/@src')
if len(image) == 0:
image = item.xpath(
'.//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]/@src')
if len(image) == 0:
image = item.xpath(
'.//div[@class="a-section aok-relative s-image-fixed-height"]/img[@class="s-image"]/@src')
if len(image) == 0:
image = item.xpath('.//img[@class="s-image"]/@src')
is_prime = item.xpath('.//i[@class="a-icon a-icon-prime a-icon-medium"]/@aria-label')
star_level = item.xpath('.//span[@class="a-icon-alt"]/text()')
star_level = star_level[0] if len(star_level) else ""
star_level = star_level.replace(' out of 5 stars', '')
new_delivery = []
for i in range(len(delivery)):
delivery[i] = delivery[i].strip()
if delivery[i]:
new_delivery.append(delivery[i])
result.append({
"title": (title[0] if len(title) else ""),
"price": (Fmt.parse_price(price[0]) if len(price) else ""),
"delivery": new_delivery,
"review_count": (review_count[0] if len(review_count) else ""),
"unique_value": (asin[0] if len(asin) else ""),
"image": (image[0] if len(image) else ""),
"is_prime": (is_prime[0] if len(is_prime) else ""),
"star_level": star_level,
})
return result
def get_search_category(text):
paths = [
'//ul[@aria-labelledby="n-title"]/span/span[1]//li/span/a/span/text()',
'//ul[@id="filter-n"]/span/span[1]//li/span/a/span/text()',
]
keyword = None
for path in paths:
category = text.xpath(path)
if len(category):
category = category[1] if len(category) > 1 else category[0]
keyword = category.strip().replace(' ', '+')
break
return keyword
def category_search(temp_url, text, site: str = "com", **kwargs):
"""
分类搜索
:param temp_url:
:param text:
:return:
"""
html = etree.HTML(text)
keyword = get_search_category(html)
if keyword is None:
logger.error(f"没有分类")
return []
page = get_url_query_page(temp_url)
url = f"https://www.amazon.{site}/s?k={keyword}&page={page}"
logger.info(f"获取分类: {url}")
proxy = kwargs.get('proxy', None)
headers = kwargs.get('headers', None)
text = Request.request_html(url, proxy, **{"headers": headers})
return amazon_search(text)
def get_best_seller_url(text):
"""
获取畅销榜url
:param text:
:return:
"""
best_seller_url = text.xpath('//ul[@class="a-unordered-list a-nostyle a-vertical zg_hrsr"]/li/span/a/@href')
if len(best_seller_url) == 0:
urls = text.xpath('//table[@id="productDetails_detailBullets_sections1"]//tr//@href')
for url in urls:
if "bestsellers" in url:
best_seller_url.append(url)
if len(best_seller_url) > 0:
best_seller_url = best_seller_url[-1]
else:
best_seller_url = best_seller_url[0]
return best_seller_url
def product_best_sellers(text, proxy, headers, site: str = "com"):
"""
产品畅销榜
:param proxy:
:param text:
:param headers:
:return:
"""
html = etree.HTML(text)
asin = html.xpath('//div[@class="s-main-slot s-result-list s-search-results sg-row"]//div[@data-component-type="s-search-result"][5]/@data-asin')
logger.info(f"获取asin: {asin}")
if len(asin) == 0:
logger.error(f"没有asin")
return []
url = f"https://www.amazon.{site}/dp/{asin[0]}"
text = Request.request_html(url, proxy, **{"headers": headers})
if text is None:
return []
text = etree.HTML(text)
best_seller_url = get_best_seller_url(text)
if len(best_seller_url) == 0:
logger.error(f"没有畅销榜")
return []
result = []
for i in range(1, 3):
for _ in range(3):
try:
url = f"https://www.amazon.{site}{best_seller_url}?pg={i}"
logger.info(f"开始采集: {url}")
text = Request.request_html(url, proxy, **{"headers": headers, 'is_check_postal': False})
if text is None:
continue
response = amazon_best_sellers(text)
result += response
break
except Exception as e:
logger.error(f"获取畅销榜: {e}")
return result
def style_snap_upload(url: str = None, style_snap_token: str = None, style_snap_cookie: str = None, site: str = "com"):
"""
style snap 上传
"""
file_path = None
name = str(uuid.uuid1())
for i in range(3):
try:
logger.info(f"开始下载: {url}")
file_path = f"image/{name}.png"
if not os.path.exists("image"):
os.makedirs("image")
if IS_DEBUG:
file_path = "image/1.jpg"
else:
with open(file_path, 'wb') as f:
f.write(requests.get(url, timeout=15, proxies={"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}).content)
logger.info(f"上传图片: {file_path}")
if os.path.exists(file_path):
break
except:
logger.error(f"下载失败: {url}")
if not os.path.exists(file_path):
logger.error(f"图片下载失败")
return []
# 上传图片
url = f"https://www.amazon.{site}/stylesnap/upload?stylesnapToken={style_snap_token}&language=en_US"
payload = {}
headers = {
'origin': f'https://www.amazon.{site}',
'referer': f'https://www.amazon.{site}/stylesnap',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0',
'Cookie': style_snap_cookie
}
result = []
for i in range(5):
# _proxy = Search(redis_singleton).get_proxy()
_proxy = {
"proxy": "127.0.0.1:7890"
}
try:
file_value = open(file_path, 'rb').read()
files = [
('explore-looks.jpg', (f"{name}.png", file_value, 'image/png'))
]
proxies = {
"https": f"http://{_proxy['proxy']}",
"http": f"http://{_proxy['proxy']}"
}
response = _requests.request("POST", url, headers=headers, data=payload, files=files, proxies=proxies, timeout=15).json()
search_result = response['searchResults']
if len(search_result) == 0:
logger.error(f"没有搜索结果")
break
search_result = search_result[0]
asin_data_list = search_result['bbxAsinMetadataList']
asin_list = []
for asin_data in asin_data_list:
variations_asin = asin_data.get('twisterVariations', None)
asin = asin_data.get('asin', None)
if asin in asin_list:
continue
if variations_asin:
for variations in variations_asin:
temp_asin = variations.get('asin', None)
asin_list.append(temp_asin)
new_delivery = [
"FREE delivery",
"Tomorrow",
"on $35 of items shipped by Amazon",
"Or fastest delivery",
"Tomorrow"
]
price = asin_data['price']
if not price:
continue
result.append({
"title": asin_data['title'],
"price": Fmt.parse_price(asin_data['price']),
"delivery": new_delivery,
"review_count": 1000,
"unique_value": asin,
"image": asin_data['imageUrl'],
"is_prime": "",
"star_level": asin_data['averageOverallRating'],
})
break
except Exception as e:
logger.error(f"图片采集失败 - {e}: {url}")
if not IS_DEBUG:
# 删除图片
if os.path.exists(file_path):
os.remove(file_path)
return result
class Search:
site = Site.com
postcode = Postcode.com
lang = Lang.com
def __init__(self, singleton):
self.task_key = task_config['task_key']
self.item_key = task_config['item_key']
self.task_number = int(task_config['task_number'])
self.enabled = task_config['enabled'] == 'True'
self.is_debug = task_config['is_debug'] == 'True'
self.request_timeout = int(task_config['request_timeout'])
self.proxy_manager = ProxyManager(singleton)
self.task_manager = Task(singleton)
self.redis_singleton = singleton
def get_proxy(self):
"""
:return:
"""
if self.is_debug:
test_proxy = '127.0.0.1:7890'
proxy = '#1#1127.0.0.1:7890'
else:
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split('#')[0]
self.proxy_manager.join_proxy(proxy)
return {
"proxy": f"chensav:chensav@{test_proxy}",
"temp_proxy": proxy,
}
def join_proxy(self, proxy):
"""
加入代理
:param proxy:
:return:
"""
return self.proxy_manager.join_proxy(proxy)
def delete_proxy(self, proxy):
"""
删除代理
:param proxy:
:return:
"""
proxy_name = proxy.split('#')[1]
return self.proxy_manager.delete_proxy(proxy_name)
def run(self, task):
"""
:param task:
:return:
"""
platform_type = task.get('platform_type', TASK_TYPE_AMAZON)
task_id = task.get('task_id', None)
url = task.get('url', None)
best_headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
'cookie': 'session-id=136-2468048-6163604; i18n-prefs=USD; ubid-main=131-7499109-7807818; lc-main=en_US; at-main=Atza|IwEBIHa4KWiWSA9huPzuYcto-Dyv2TWYsxt8i0rHrAI-L9kYt0P_jCwtUCpDvUllUEZ74WXcHi204jLL1qfO9SBHTKfFjEziPUCOCwP9roNMCksnyQetSFpg69YT5DLhf1Vr86ZwyN73Vgtf7EfK1ueK4a0gfYoI8JgU8zge2imDPkCnb1uDUSl5XYZNMrM9J7ts2EeHWezk6nFS3eryaZGsZWZ7xqirdQEclPpy9tWA4AI_WA; sess-at-main="59aD29gLm8BIx53eiZ2iP7OZqPLxMfMasooC3gLTmXs="; sst-main=Sst1|PQEXRfFgggmDmG6xvvAYNCv8CZ0Z1ruNYByvOZHlvBEVZ4IXM8_bmJr7cYQzAWmZcsUaXzI4fklYPmODP0HZbmwOcRUEH9D5SlKkX7IKwMc7BMMOnqpFV4-eI1InxdYfVmdEav3wINq_hx1NRKkzWZJQXdBha68LufT46idaBnM5LkqeBlRSVjeBoQbAzUCVfiVF5FGBrbWGrs8utvYkeiabeE7G5sb9MLWXd-Ws62ARZxW233KaBqCRyWHyAh-Ccd7rgXavPZx-3fGGhjoewlTAIxitgXnKG5N9V-r4YM63iPU; skin=noskin; session-id-time=2082787201l; JSESSIONID=85EDBB6AD9DC6EBA1FD75B9685F69B8A; session-token=NLgyCO4UJzgl6fbZiJ+VZEetV9l53enC7fD9t0GeUIghtQTPiVzp6povgyFBcvFvh1vw42bR6pxJCxWeJlHqtlQr/qqtnxiA4cT/fzYJbWuFHx0Tn334NSA4jUNU2sSpwkJt8TIxDLkWhmYEEbcyKXQGfHeGhiXD5Qdw5s2/B0LNmFm6MtKXbqIoVt3YfYXEMUIYVhR1/2KPuaI8jVi/kXm/3S0v/n+bumP3CrNL7CJ9pEfB+aXzLlq4YdTT+AuZQdy/0FCZ8Kp7D83KllVEaTGt1y6ZBt60LLV70+Ddq31pYJV4U3jQfymBr/21vZyHBHX/79DiQlvMVIWJ5VKMrfFv93X5Wjp+; csm-hit=tb:JVN1CY2SVM9RA007Q9SA+s-J1W62F8TJMXRFMEH3QVB|1730453608843&t:1730453608843&adb:adblk_yes'
}
# 最大重试次数
retry = task.get('retry', 0)
if retry > 10:
logger.error('重试次数过多')
return
_proxy = self.get_proxy()
if _proxy is None:
logger.error('代理获取失败,重新入队')
self.task_manager.add_task(task, self.task_key)
return
result = None
new_url = urllib.parse.quote(url, safe=':/&=?')
request_html = functools.partial(Request.request_html, postcode=self.postcode)
try:
if "Best-Sellers" in url:
logger.info(f"Best Sellers: {url}")
text = request_html(new_url, _proxy['proxy'], **{"headers": best_headers, "is_check_postal": False})
result = amazon_best_sellers(text, site=self.site)
elif "/dp/" in url:
logger.info(f"商品详情: {url}")
asin = get_url_asin(url)
if asin:
result = amazon_detail(asin, _proxy['proxy'], site=self.site, **{"headers": best_headers, "is_check_postal": False})
elif "tiktok_platform" in url:
logger.info(f"分类搜索: {url}")
headers = self.task_manager.get_loca_cookie(self.site)
text = request_html(new_url, _proxy['proxy'], **{"headers": headers})
result = category_search(new_url, text, site=self.site, **{"proxy": _proxy['proxy'], "headers": headers})
elif "product_best_sellers" in url:
logger.info(f"产品 Best Sellers: {url}")
text = request_html(new_url, _proxy['proxy'], **{"headers": best_headers})
result = product_best_sellers(text, _proxy['proxy'], best_headers, site=self.site)
elif "style_snap_upload" in url:
logger.info(f"以图搜图: {url}")
style_snap_token = task.get('style_snap_token', None)
style_snap_cookie = task.get('style_snap_cookie', None)
result = style_snap_upload(url, style_snap_token, style_snap_cookie, site=self.site)
else:
logger.info(f"亚马逊搜索: {url}")
headers = self.task_manager.get_loca_cookie(self.site)
text = request_html(new_url, _proxy['proxy'], **{"headers": headers})
result = amazon_search(text)
except Exception as e:
logger.error(f"获取数据失败: {e}")
self.task_manager.add_task(task, self.task_key)
if result:
data = {
"data": {
"task_id": task_id,
"collection_data": result,
'platform_type': platform_type,
"country": self.lang[-2:]
},
"type": task.get('callback_type'),
"admin_users_id": task.get('admin_users_id', 0),
"app_name": task.get('app_name', 'admin'),
"project": task.get('project', 'tiktok'),
}
if self.is_debug:
logger.info(f"回调数据: {json.dumps(data)}")
self.task_manager.callback_task(self.item_key, data)
return data
# if self.is_debug:
# sys.exit(0)
def main(self):
"""
:return:
"""
if not self.is_debug:
proxy_number = self.proxy_manager.get_proxy_number()
if proxy_number <= 1:
logger.info('代理数量不足')
sleep(3)
return
if not self.enabled:
logger.info('搜索任务未开启')
return
if self.is_debug:
tasks = [
'{"url":"https://www.amazon.com/s?k=Adhesive+Shower+Caddy&i=garden&page=2&crid=1MR2IOB4BMAXF&qid=1739174661&sprefix=adhesive+shower+caddy%2Cgarden%2C391&xpid=--1JOuYBktM8-&ref=sr_pg_2","admin_users_id":1,"task_id":37,"collection_number":0,"callback_type":8,"app_name":"admin","platform_type":6, "style_snap_token": "hE93khRzOa%2FbMswfn3xe9TAXTr9KNux4Mdi5XAiKFHd%2FAAAAAGd8344AAAAB", "style_snap_cookie": "csm-hit=tb:s-VXZENW8D1NAY9XYEPHZF|1736236940971&t:1736236941041&adb:adblk_no; ubid-main=132-3082826-1910101; session-id-time=2082787201l; lc-main=zh_CN; session-token=UeNF8HVHFLqfKqC7TdegbxLKHHtTCq6+w5zpa+0jEhd1e6mVqdYw0hvx7OrgKYaJuhR/kaf6iQrh/hLtY62NjNZ5rFlmumhqA6t2EvY2/+kvfs/1WBMMxWgQ5zUwUfHda+BxlFo1VJHUllQvwBlSfAL9rs+x5e1X4QXKbsYphIJ+w6vTq6g7eyLNp31mfaSOAh3uMkf6ydZ2kXtMJKClJ/0KKUVHcfKn9cmslh7sOI5dwhrXMwFRE++ELWFc0mSXj6OK+z0hrHD5vvMC0idZmpCBMQ+GKrtAOGZ1o3rXU+iTwWaYekdb14ZYcPxQas8Q1g8MgN82WenDAxCwCav6YT0qsOTou4My; i18n-prefs=USD; session-id=130-1134194-2738868"}']
else:
tasks = self.task_manager.get_task(self.task_key, self.task_number)
if not tasks:
logger.info('任务队列为空')
sleep(5)
return
queue = []
start_time = time.time()
tasks = [json.loads(task) for task in tasks]
for task in tasks:
queue.append(self.run(task))
if queue:
asyncio.gather(*queue)
logger.info(f"任务耗时: {time.time() - start_time}")
if self.is_debug:
sys.exit(0)
import asyncio
import os
import platform
from loguru import logger
from spider.ada import Adapter
from spider.base_detail import Goods, redis_singleton
class ComGoods(Goods):
pass
class DeGoods(Goods, metaclass=Adapter):
pass
class ItGoods(Goods, metaclass=Adapter):
pass
class FrGoods(Goods, metaclass=Adapter):
pass
class EsGoods(Goods, metaclass=Adapter):
pass
class JpGoods(Goods, metaclass=Adapter):
pass
if __name__ == "__main__":
if platform.system() == "Windows":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
PID_FILE = "./pid/product_detail.pid"
for _ in range(200):
# 判断是否存在pid文件
if not os.path.exists(PID_FILE):
logger.error("任务退出")
break
goods = Goods(redis_singleton)
asyncio.run(goods.main())
import asyncio
import os
import platform
from loguru import logger
from spider.ada import Adapter
from spider.base_monitor import Monitoring, redis_singleton
class ComMonitoring(Monitoring):
pass
class DeMonitoring(Monitoring, metaclass=Adapter):
pass
class ItMonitoring(Monitoring, metaclass=Adapter):
pass
class FrMonitoring(Monitoring, metaclass=Adapter):
pass
class EsMonitoring(Monitoring, metaclass=Adapter):
pass
class JpMonitoring(Monitoring, metaclass=Adapter):
pass
if __name__ == "__main__":
if platform.system() == "Windows":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
PID_FILE = "./pid/monitoring.pid"
for _ in range(100):
# 判断是否存在pid文件
if not os.path.exists(PID_FILE):
logger.error("任务退出")
break
monitoring = Monitoring(redis_singleton)
asyncio.run(monitoring.main())
import asyncio
import os
import platform
from loguru import logger
from spider.ada import Adapter
from spider.base_search import Search, redis_singleton
class ComSearch(Search):
pass
class DeSearch(Search, metaclass=Adapter):
pass
class ItSearch(Search, metaclass=Adapter):
pass
class FrSearch(Search, metaclass=Adapter):
pass
class EsSearch(Search, metaclass=Adapter):
pass
class JpSearch(Search, metaclass=Adapter):
pass
if __name__ == "__main__":
if platform.system() == "Windows":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
PID_FILE = "./pid/search.pid"
for _ in range(100):
if not os.path.exists(PID_FILE):
logger.info("任务退出")
break
search = Search(redis_singleton)
asyncio.run(search.main())
#!/bin/bash
# 保存PID
PID_DIR="./pid"
mkdir -p $PID_DIR
# 保存LOG
LOG_DIR="./log"
mkdir -p $LOG_DIR
# 启动Worker
nohup celery -A celery_app worker -Q spider -l info -E > $LOG_DIR/worker.log 2>&1 &
echo $! > $PID_DIR/worker.pid
# 启动Beat
nohup celery -A celery_app beat -l info > $LOG_DIR/beat.log 2>&1 &
echo $! > $PID_DIR/beat.pid
# 启动Flower
#nohup celery -A celery_app flower --port=5555 --address=127.0.0.1 > $LOG_DIR/flower.log 2>&1 &
#echo $! > $PID_DIR/flower.pid
echo "服务已启动 | Worker PID:$(cat $PID_DIR/worker.pid) | Beat PID:$(cat $PID_DIR/beat.pid) | Flower PID:$(cat $PID_DIR/flower.pid)"
#!/bin/bash
PID_DIR="./pid"
kill -9 $(cat $PID_DIR/*.pid) 2>/dev/null
rm -f $PID_DIR/*.pid
echo "服务已停止"
[program:celery-app-detail]
command = celery -A celery_app worker -Q detail -n spider_detail_worker@%%h -l info --concurrency=40 -E
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/celery_app_spider.log
stderr_logfile = /etc/supervisord.d/log/celery_app_spider.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:celery-app-search]
command = celery -A celery_app worker -Q search -n spider_search_worker@%%h -l info --concurrency=20 -E
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/celery_app_spider.log
stderr_logfile = /etc/supervisord.d/log/celery_app_spider.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:celery-app-dial]
command = celery -A celery_app worker -Q dial -n dial_worker@%%h -l info --concurrency=3 -E
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/celery_app_dial.log
stderr_logfile = /etc/supervisord.d/log/celery_app_dial.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:celery-beat]
command = celery -A celery_app beat -l info
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/celery_beat.log
stderr_logfile = /etc/supervisord.d/log/celery_beat.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:celery-flower]
command = celery -A celery_app flower --port=5555 --address=0.0.0.0 --basic_auth=chensav:chensav
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/celery_flower.log
stderr_logfile = /etc/supervisord.d/log/celery_flower.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:task]
command = /usr/bin/python3 task.py
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/task.log
stderr_logfile = /etc/supervisord.d/log/task.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
[program:callback]
command = /usr/bin/python3 callback.py
directory = /root/amazon-mult-site
autorestart = true
startsecs = 10
startretries = 10
stdout_logfile = /etc/supervisord.d/log/callback.log
stderr_logfile = /etc/supervisord.d/log/callback.log
stdout_logfile_maxbytes = 2MB
stderr_logfile_maxbytes = 2MB
user = root
priority = 999
numprocs = 1
process_name = %(process_num)02d
from time import sleep
import asyncio
import json
import os
import threading
import time
import redis
from curl_cffi import requests
from loguru import logger
from conf import config
from const import Site
from db import RedisSingleton
from tool import Task
COLL_DOMAIN = config['app']['coll_domain']
_redis_db = redis.Redis.from_url(config['redis']['url'], decode_responses=True)
task_monitoring_config = config['task-monitoring']
task_search_config = config['task-search']
task_product_detail_config = config['task-product-detail']
redis_config = config['redis']
cookie_config = config['cookie']
DEFAULT_HEADER = {
"Content-Type": "application/json",
"Accept": "application/json",
}
PID_FILE = './pid/task.pid'
def get_task(task_key: str = task_monitoring_config['queue_key'], number: int = 1):
"""
获取任务
:param task_key:
:param number:
:return:
"""
try:
url = f"{COLL_DOMAIN}/api/collection/task?number={number}&queue={task_key}"
response = requests.get(url, headers=DEFAULT_HEADER, verify=False)
response = response.json()
if response["code"] == 0:
return response["data"]
else:
return {}
except Exception as e:
logger.error(f"获取任务异常 : {e}")
return {}
def add_task(task_key: str, redis_key: str, task_number: int):
"""
添加任务
:param task_key:
:param redis_key:
:param task_number:
:return:
"""
items = get_task(task_key, task_number)
task_lists = items.get('list', [])
if task_lists:
logger.info(f"{task_key} - {len(task_lists)}个任务加入队列")
for item in task_lists:
_redis_db.lpush(redis_key, json.dumps(item))
def run(task_config: dict = task_monitoring_config):
while True:
if not os.path.exists(PID_FILE):
logger.error('任务退出')
break
add_task_enabled = task_config.get('add_task_enabled')
task_number = int(task_config.get('number'))
task_min_number = int(task_config.get('min_number'))
redis_key = task_config.get('task_key')
todo_task_count = _redis_db.llen(redis_key)
task_key = task_config.get('queue_key')
try:
task_keys = task_key.split(',')
logger.info(f"{redis_key}任务队列长度 : {todo_task_count}")
if todo_task_count <= task_min_number and add_task_enabled == 'True':
for key in task_keys:
add_task(key, redis_key, task_number)
except KeyboardInterrupt:
logger.error(f"任务终止")
except Exception as e:
logger.error(f"任务异常 : {redis_key} : {e}")
time.sleep(5)
def cookie():
for site in Site.values():
time_key = cookie_config['cookie_time_key']
time_key = f"{time_key}:{site}"
_redis_db.delete(time_key)
while True:
if not os.path.exists(PID_FILE):
logger.error('任务退出')
break
logger.info(f"获取cookie")
for site in Site.values():
try:
task_manager.get_cookie(site)
except:
logger.error(f"获取cookie异常")
sleep(5)
if __name__ == '__main__':
tasks = []
redis_singleton = RedisSingleton(redis_url=redis_config['url'])
task_manager = Task(redis_singleton)
with open(PID_FILE, 'w') as f:
f.write(str(os.getpid()))
if task_monitoring_config.get('enabled', None) == 'True':
logger.info(f"监控任务添加")
t = threading.Thread(target=run, args=(task_monitoring_config,))
tasks.append(t)
else:
logger.info(f"监控任务未启动")
if task_product_detail_config.get('enabled', None) == 'True':
logger.info(f"发布任务添加")
t = threading.Thread(target=run, args=(task_product_detail_config,))
tasks.append(t)
else:
logger.info(f"发布任务未启动")
if task_search_config.get('enabled', None) == 'True':
logger.info(f"搜索任务添加")
t = threading.Thread(target=run, args=(task_search_config,))
tasks.append(t)
else:
logger.info(f"搜索任务未启动")
t = threading.Thread(target=asyncio.run, args=(cookie(),))
tasks.append(t)
for t in tasks:
t.start()
for t in tasks:
t.join()
import json
import pytest
from celery import Celery
from celery.contrib.testing.worker import start_worker
from celery_tasks import assign_spider_task, start_spider_task
from const import Site
@pytest.fixture(scope="session")
def celery_worker():
app = Celery(
broker_url="memory://",
result_backend="cache+memory://",
task_always_eager=False,
)
with start_worker(
app,
shutdown_timeout=600,
perform_ping_check=False,
# loglevel="DEBUG",
) as worker:
yield worker
def test_assign_task(celery_worker):
asin = "B0F4K4NQM2"
sites = [Site.com]
# sites = Site.values()
for site in sites:
style_snap_token = {
Site.de: "hOCRqU2moPvF7OHbdVIA%2FcA5ovfB6r0ylJuUi%2FlOBZz3AAAAAGgkOxkAAAAB",
Site.it: "hDnqRGV%2FHNIvmt066sCeQ%2BCV3nSmUqKeaz8xxb%2FvRxVcAAAAAGgkSZgAAAAB",
Site.fr: "hOszNRVKeyUBhfw3wv5b0aA8oeLE2aeIwsz2PUdA298eAAAAAGgkYs0AAAAB",
Site.es: "hBrvHdT5rdf55ogsaVpOntP1EBtw%2FzMQ9EjExhl%2BfaEpAAAAAGgq2CgAAAAB",
Site.jp: "hLIxmKRriLNZAn1JzZapY3HvPn42ymFTSMsDTQg5HAfBAAAAAGgkcnQAAAAB",
}
style_snap_cookie = {
Site.de: "session-id=262-3255866-8161667; i18n-prefs=EUR; lc-acbde=de_DE; ubid-acbde=260-9608208-7094621; session-id-time=2082787201l; session-token=o/NZjNyInsXiiUgIboxXoXSRrYwl3rx9fAymAp4CCwf6WdcuRNAUQCEFg47k8VXTC3KSYw6fBMBgM6tXUIwbHjsorhAdhwCurKbypoC3lgoZMkf00U5I1kSmNqc3HLpjse4Ymn69X/0QdC85JsfK71ik4AxLCvPas/xOzW4eqxBAyX+pV3wApnfeZnIphELnlFDSr9mrQ4OK0sJaveHFbCQyPsCsOqEbgqokVxqZuo1PHbbNtk8kmY5+O4FGZZ1Rd29/WpI325eY46Khf3mKdQsUvmHJ9Qdjti0OHBQXqG/2Uas2LyCi0cKyybMJPn7LqzFS0ro6MZ3ufRXA2ZbdRlzCu/imptOn; rxc=AM5BPAyBERNPU5AAzsA; csm-hit=tb:s-60X7Y7EP7NMPAY5A4EB8|1747204893470&t:1747204894017&adb:adblk_no",
Site.it: "session-id=259-0276938-7980111; i18n-prefs=EUR; ubid-acbit=261-6163436-4078364; session-id-time=2082787201l; session-token=E/w0330BjSmKZBECYTfHbZ7yEy7I4AKBdWL/m8ZX+tL2cAR7kPoLrFNx8S5dx16pTkHdMKW09HLjWeQeNz7llVEHqn172fHwxDcS7pazkXJgNRSRLDsLKfhf37omIg1wiBFLLlF/Ze/7Y9UHiodrU+0Ii0BZB6fXmpuia1MfYZVIhc0tFhAo8YFtCB9tyGZo0HTV11uIxxlm7fWf8o750ibLu1P2XupDWM1XEWpTkioZfgwU/HDDA7VO3nteiy42ajAiVqg+Mp6qA4qnaem9nNx+7TQwOEfhY1QvoMVtrfiZydc/m5wAij9/Cy8vfkSSxrtRXiwMwMI2lkzTSyfIzd0KLLJ+VdrT; csm-hit=tb:s-WXX22RF20QVB7J6QY6GK|1747208604822&t:1747208605107&adb:adblk_no",
Site.fr: "session-id=259-3895263-6553629; session-id-time=2082787201l; i18n-prefs=EUR; ubid-acbfr=259-4408157-0120414; session-token=eUYnvkQ5sAKaXqveut1y8oduGNiOD2EjeL69ox8B8WMis+WxwXvxOxl6YG3/Vbmxua74nu6Po4SqX7jb2zWEN0MafCL2vNWuqH8EtzWGxf7b956vhl+Zke6znbzQIGc0TJvEHQxYH4qhU2nhjMewvfEKaNpLin+BlSteJ/29YPTAmJoDmn1SBQFmLS1F+lwvJslCVl2tE5wPQtlRJRoy1J3Y3PyilY/QeSeOoQm0kspCKVwiIZ0Rker6IoiJ994xQp9mBMPgf7Kaxq+3/4WcJH0gUgDMTEJ422JCvVyUsPlIqoB87OnL9svvThxe2rw3+G3Pl7RGcfsrgTNxMLajeryo0c4gDsCA; csm-hit=tb:s-MVXSWSB18MKR7M8ZGZPG|1747214963697&t:1747214964546&adb:adblk_no",
Site.es: "session-id=257-1877650-6628461; session-id-time=2082787201l; i18n-prefs=EUR; ubid-acbes=257-0235449-8488852; session-token=4VefymsDENsTHhmV3VThQL9xtbG25qqF0ue7hTQAuej9Hwcb8gteCVV4dDQ8BzXeSdcE1qxvGNxI5sjDddBC4GwnfXxNjPW4KKREHTXl4AcOnIBB0NsAP8ysdlrGmnF2i6tZaPQQmujRiQkE6W6541cqvo2awOmApuIAzLb/yTYBIwJdC7DAbRSAUjBpCb/Par21qrO4i6Vz/yzK9jvAZdmWNIHE6aFN6P0gNMnz8ubsQwqHzEhCqDZDzd2/82dB7ZkztnblwWdLTf11HIWSVcwY0Yrt/3Dva64m73AGG9ehiXhZVZAo9nBai5dRi1Bw21HZm9LicAtLlbc51P5nskJuXBVBUqHT; csm-hit=tb:s-6TH1ZWPZ3MSWA7C2FNME|1747638315404&t:1747638315630&adb:adblk_no",
Site.jp: "session-id=356-1153481-7631550; session-id-time=2082787201l; i18n-prefs=JPY; lc-acbjp=zh_CN; ubid-acbjp=356-3715403-6686010; session-token=\"UPpZlDoDBS1AIP3L68R2R+yjb0KoWvefyp2stmNqnQdqPJSW03v3MGhH9DhlM1a14QVzBoJY6/sys0OcBcFD+edCQDl9W1FPUeMbExBr4xxrsaPHcUWQ/pciyQ6+tXq1A4mn42SoZgYU1qf9Sz3P2+gz5NPfj8R82NFt5oL3s61DpWWDeLLXLdWIzHHyAq6Szi19aVK9GnhDuMu67f/vvi/ecgaeRgFumFtFmOCjAD3zTkumKFvJnKyEEHUzS3lye1TOx/bB6mEtPVHxlo3R4XIgeeDQeaAkjLot9ksW9GzRhe9xtdUHTPHRI9P8oKy2PMWyrqu7yH5AhIj0uYC/XUtYoom6Z85Qr5J8RxoeQnY=\"; csm-hit=tb:s-VE4YKH0HM7B9R7F0G6GC|1747219065639&t:1747219065883&adb:adblk_no",
}
url1 = f"https://www.amazon.{site}/dp/{asin}?th=1"
key1 = "task-product-detail"
url2 = f"https://www.amazon.{site}/dp/{asin}"
key2 = "task-monitoring"
key3 = "task-search"
task1 = json.dumps(
{
"url": f"https://www.amazon.{site}/s?k=Adhesive+Shower+Caddy&page=2&xpid=oznV7KayXAf7P&qid=1747215325&ref=sr_pg_2",
# "url": f"https://www.amazon.{site}/s?k=Adhesive+Shower+Caddy&i=garden&page=2&crid=1MR2IOB4BMAXF&qid=1739174661&sprefix=adhesive+shower+caddy%2Cgarden%2C391&xpid=--1JOuYBktM8-&ref=sr_pg_2",
"admin_users_id": 1,
"task_id": 37,
"collection_number": 0,
"callback_type": 8,
"app_name": "admin",
"platform_type": 6,
}
)
task2 = json.dumps(
{
"url": f"https://www.amazon.{site}/+style_snap_upload", # 虚构路径
"admin_users_id": 1,
"task_id": 37,
"collection_number": 0,
"callback_type": 8,
"app_name": "admin",
"platform_type": 6,
"style_snap_token": style_snap_token.get(site),
"style_snap_cookie": style_snap_cookie.get(site),
}
)
all_tasks = [
['{"id": 252, "admin_users_id": 1, "log_id": 11, "platform_type": 1, "callback_type": 9, "app_name": "admin", "max_sku_number": 35, "project": "temu", "url": "%s"}'%url1],
['{"url":"%s","item_id":"27","admin_users_id":1,"callback_type":5,"collection_type":1,"app_name":"admin","cache_time":60}'%url2],
[task1], [task2]
]
keys = [key1, key2, key3, key3]
items = list(zip(keys, all_tasks))
items = [items[1]]
for key, tasks in items:
# tasks = None
assign_spider_task.apply_async(args=(key, tasks,)).get()
def test_start_task(celery_worker):
# start_dial_task.delay().get()
start_spider_task.delay().get()
if __name__ == "__main__":
# pytest.main(["./"])
# pytest.main(["test/test_celery.py::test_start_task"])
pytest.main(["test/test_celery.py::test_assign_task"])
import json
import pytest
import redis
from conf import config
from const import Site, Spider
DB_REDIS = redis.Redis.from_url(config["redis"]["url"], decode_responses=True)
def test_spider_task():
asin = "B0CMTQFXB8"
for _ in range(1):
DB_REDIS.lpush(
config[Spider.detail]["task_key"],
# json.dumps({"url": f"https://www.amazon.{Site.de}/dp/{asin}"}),
json.dumps({'id': 2061071844, 'admin_users_id': 0, 'log_id': 8252798, 'platform_type': 6, 'callback_type': 9, 'app_name': 'admin', 'max_sku_number': 35, 'url': 'https://www.amazon.de/dp/B01MYBVW76', 'project': 'tiktok'})
)
if __name__ == "__main__":
pytest.main(["test/test_task.py::test_spider_task"])
from __future__ import annotations
import json
import random
import re
from datetime import datetime, timedelta
from babel.dates import get_month_names, get_day_names
from curl_cffi import requests
from curl_cffi.requests import AsyncSession
from dateutil import parser
from dateutil.relativedelta import relativedelta
from loguru import logger
from lxml import etree
from conf import config
from const import Lang, StockStatus
from const import SiteType
DOMAIN = config["app"]["domain"]
COOKIE = config["cookie"]
IMPERSONATE = config["app"]["impersonate"]
class Task:
def __init__(self, redis_singleton):
self.redis_singleton = redis_singleton
def get_task(self, task_key: str = "", batch_size: int = 10):
"""
获取任务
:param task_key:
:param batch_size:
:return:
"""
redis_client = self.redis_singleton.get_connection()
with redis_client.pipeline() as pipe:
pipe.lrange(task_key, 0, batch_size - 1)
pipe.ltrim(task_key, batch_size, -1)
datas, _ = pipe.execute()
return datas
def get_task_number(self, task_key: str = ""):
"""
获取任务数量
:param task_key:
:return:
"""
redis_client = self.redis_singleton.get_connection()
number = redis_client.llen(task_key)
return number
def add_task(self, task: dict, task_key: str):
"""
添加任务
:param task:
:param task_key:
:return:
"""
redis_client = self.redis_singleton.get_connection()
task["retry"] = task.get("retry", 0) + 1
if task_key == "task:amazon:monitoring":
only = [
"url",
"item_id",
"admin_users_id",
"callback_type",
"collection_type",
"app_name",
"step_value",
"is_first",
"retry",
"cache_time",
]
task = {key: task[key] for key in only if key in task}
redis_client.lpush(task_key, json.dumps(task))
def callback_task(self, callback_key: str, result: dict):
"""
回调任务
:param callback_key:
:param result:
:return:
"""
redis = self.redis_singleton.get_connection()
redis.lpush(callback_key, json.dumps(result))
def get_cookie(self, site: str = "com"):
"""
获取cookie
:return:
"""
type = SiteType.__dict__.get(site)
if type is None:
return None
cookie_url = f"{DOMAIN}/api/collection/get-cookie?type={type}"
redis_client = self.redis_singleton.get_connection()
time_key = COOKIE["cookie_time_key"]
time_key = f"{time_key}:{site}"
is_exists = redis_client.get(time_key)
if is_exists:
return None
response = requests.get(cookie_url).json()
cookie = response.get("data", [])
if cookie:
logger.info(f"获取cookie成功")
cookie = cookie[0]
new_cookie = {
"cookie": cookie.get("cookie", ""),
"user-agent": cookie.get("user-agent", ""),
}
redis_client.set(f"cookie:{site}", json.dumps(new_cookie))
redis_client.set(time_key, 1, ex=60)
else:
redis_client.delete(time_key)
return cookie
def get_loca_cookie(self, site: str = "com"):
"""
获取本地cookie
:return:
"""
redis_client = self.redis_singleton.get_connection()
cookie = redis_client.get(f"cookie:{site}")
if not cookie:
cookie = self.get_cookie(site)
if isinstance(cookie, dict):
return cookie
return json.loads(cookie)
class Request:
@staticmethod
def request_html(url: str = "", proxy: str = None, **kwargs: dict):
"""
请求html
:param proxy:
:param url:
:return:
"""
with AsyncSession(max_clients=50) as s:
# 清空 请求的值
s.headers.clear()
s.cookies.clear()
s.proxies.clear()
proxies = {
"https": proxy,
"http": proxy,
}
headers = kwargs.get("headers", {})
timeout = kwargs.get("timeout", 10)
postcode = kwargs.get("postcode", "20001")
headers["accept-language"] = (
"zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
)
is_check_postal = kwargs.get("is_check_postal", True)
response = s.get(
url,
headers=headers,
proxies=proxies,
timeout=timeout,
impersonate=IMPERSONATE,
)
if response.status_code != 200 and response.status_code != 404:
raise Exception(f"状态码错误: {response.status_code}, url:{url}")
text = response.text
html = etree.HTML(text)
captcha_image = html.xpath('//div[@class="a-row a-text-center"]/img/@src')
if len(captcha_image):
raise Exception("出现验证码")
if response.status_code == 200 and is_check_postal:
postal = html.xpath('//span[@id="glow-ingress-line2"]/text()')
postal = postal[0].strip() if len(postal) else ""
if not postal or postcode not in postal:
raise Exception("采集邮编错误")
is_product_detail = kwargs.get("is_product_detail", None)
is_link_error = html.xpath('//div[@id="g"]/a/@href')
title = Tool.get_title(html)
if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
raise Exception(f"采集内容有误")
return text
class Tool:
@staticmethod
def get_impersonate():
"""
获取伪装头
:return:
"""
impersonates = [
"chrome131",
"chrome124",
"chrome123",
"chrome120",
]
return random.choice(impersonates)
@staticmethod
def get_title(html):
"""
获取标题
:param html:
:return:
"""
title = html.xpath('//span[@id="productTitle"]/text()')
if len(title) == 0:
title = html.xpath('//span[@id="bond-title-desktop"]/text()')
return title
class Proxy:
@staticmethod
def get_zhan_proxies():
"""
获取代理
:return:
"""
proxies_url = f"{DOMAIN}/api/proxies/all?sign=ftd*kcm.ygh4mjp7ERJ"
proxies = requests.get(proxies_url).json()
ip = random.choice(proxies)
if not ip:
raise Exception(f"获取代理失败")
if "status" in ip:
return None
return ip
class Fmt:
@staticmethod
def _get_parserinfo(lang=Lang.com):
if lang == Lang.com:
return parser.parserinfo()
months = get_month_names("wide", locale=lang)
abbrev_months = get_month_names("abbreviated", locale=lang)
days = get_day_names("wide", locale=lang)
abbrev_days = get_day_names("abbreviated", locale=lang)
class TmpParserinfo(parser.parserinfo):
WEEKDAYS = [
(abbrev.lower(), full.lower())
for abbrev, full in zip(abbrev_days.values(), days.values())
]
MONTHS = [
(abbrev.lower(), full.lower())
for abbrev, full in zip(abbrev_months.values(), months.values())
]
return TmpParserinfo()
@staticmethod
def parse_date(string: str, lang=Lang.com):
"""解析日期
:param string:
:param lang:
:return:
"""
if not string:
return ""
elif "Today" in string:
dt = datetime.now()
elif "Tomorrow" in string:
dt = datetime.now() + timedelta(days=1)
else:
patt1 = re.compile(r"([\w\s]+)-([\w\s]+)")
patt2 = re.compile(r"(.*?)(\d+\D*-)(\D*\d+.*)")
if patt1.match(string):
string = patt1.match(string).group(2)
elif patt2.match(string):
string = patt2.sub(r"\1\3", string)
dt = parser.parse(string, parserinfo=Fmt._get_parserinfo(lang), fuzzy=True)
if dt.month < datetime.now().month:
dt = dt + relativedelta(years=1)
date = dt.strftime("%Y-%m-%d")
logger.debug(f"{string} -> {date}")
return date
@staticmethod
def parse_status(string: str, stock_status=StockStatus.com):
"""解析库存状态
:param string: _description_
:param stock_status: _description_, defaults to StockStatus.com
:return: _description_
"""
if not string:
return ""
string = string.strip().lower()
return "In Stock" if stock_status.lower() in string else "Only"
@staticmethod
def parse_price(string: str):
"""解析价格
:param string: _description_
:return: _description_
"""
if not string:
return float("nan")
return float(re.sub(r"[^\d.]", "", string))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment