Commit 7e696e48 by yexing

u

parent 11e330a5
......@@ -88,6 +88,14 @@ class Tool:
data_json = re.findall("jQuery.parseJSON\('(.*)'\)", text)
data_json = data_json[0].replace("\\'", "'")
return json.loads(data_json)
@staticmethod
def clean_text(text: str) -> str:
    """Normalize scraped text into a trimmed, single-spaced string.

    Invisible characters (C0/C1 controls and U+2000-U+200F, a range that
    contains the zero-width characters and the LRM/RLM bidi marks) are
    removed, then every run of colons and whitespace is collapsed into
    one space and the result is stripped.

    Whitespace characters that fall inside those ranges (tab, newline,
    U+2003 EM SPACE, ...) are deliberately kept for the collapsing step
    instead of being deleted, so words separated only by a line break or
    a tab are not glued together.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing visible remains.
    """
    # Drop invisible characters, but skip anything the whitespace class
    # matches: those are separators and must survive until the collapse.
    visible = re.sub(r'(?!\s)[\x00-\x1f\x7f-\x9f\u2000-\u200f]', '', text)
    # Colons are treated as separators as well, then runs become one space.
    return re.sub(r'[:\s]+', ' ', visible).strip()
class ProxyMixin:
......@@ -100,9 +108,9 @@ class ProxyMixin:
"""
if self.is_debug:
test_proxy = "127.0.0.1:7890"
proxy = "#1#2#127.0.0.1:7890"
proxy = "#1#2#127.0.0.1:7890"
else:
proxy = self.proxy_manager.get_proxy()
proxy = self.proxy_manager.get_proxy()
if proxy is None:
return None
test_proxy = proxy.split("#")[0]
......@@ -158,15 +166,30 @@ class InfoSpider(ProxyMixin):
ths: list[etree._Element] = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]/tr/th'
)
tds: list[etree._Element] = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]/tr/td'
span: list[etree._Element] = html.xpath(
'//*[@id="detailBullets_feature_div"]/ul/li/span/span'
)
detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
product_dimensions, item_weight = "", ""
if ths:
tds: list[etree._Element] = html.xpath(
'//*[@id="productDetails_detailBullets_sections1"]/tr/td'
)
detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
product_dimensions = detail.get("Product Dimensions", "")
item_weight = detail.get("Item Weight", "")
elif span:
detail = dict(
map(Tool.clean_text, (span[i].text.strip(), span[i + 1].text.strip()))
for i in range(0, len(span), 2)
)
package_dimensions = detail.get("Package Dimensions", "").split("; ")
if len(package_dimensions) == 2:
product_dimensions, item_weight = package_dimensions
free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
return {
"free_delivery": free_delivery,
"product_dimensions": detail.get("Product Dimensions", ""),
"item_weight": detail.get("Item Weight", ""),
"product_dimensions": product_dimensions,
"item_weight": item_weight,
}
@retry(
......@@ -179,13 +202,15 @@ class InfoSpider(ProxyMixin):
asin = Tool.get_url_asin(url)
url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
_proxy = self.get_proxy()
# if IS_DEBUG:
# _proxy = {"proxy": None}
if IS_DEBUG:
logger.debug(url)
if _proxy is None:
raise ValueError("没有代理")
try:
headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
headers = self.task_manager.get_loca_cookie(
site=self.site, postcode=self.postcode.value
)
text = Request.request_html(
url,
_proxy["proxy"],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment