Commit 7e696e48 by yexing

u

parent 11e330a5
@@ -89,6 +89,14 @@ class Tool:
         data_json = data_json[0].replace("\\'", "'")
         return json.loads(data_json)
 
+    @staticmethod
+    def clean_text(text):
+        """
+        General-purpose text cleanup.
+        """
+        cleaned = re.sub(r'[\x00-\x1f\x7f-\x9f\u2000-\u200f]', '', text)
+        return re.sub(r'[:\s]+', ' ', cleaned).strip()
+
 
 class ProxyMixin:
     proxy_manager = ProxyManager(REDIS)
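
A quick standalone check of what the two regexes in the new Tool.clean_text do. The sample string below is invented to mimic an Amazon detail-bullet label; it is not taken from a real page.

import re

# Invented sample: zero-width marks around a colon, as Amazon detail bullets often have
sample = "Item Weight \u200f : \u200e 1.2 Pounds"

# Step 1: drop ASCII/Latin-1 control characters and the U+2000-U+200F space/zero-width marks
cleaned = re.sub(r'[\x00-\x1f\x7f-\x9f\u2000-\u200f]', '', sample)

# Step 2: collapse runs of colons and whitespace into single spaces, then trim
print(re.sub(r'[:\s]+', ' ', cleaned).strip())  # -> "Item Weight 1.2 Pounds"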
@@ -158,15 +166,30 @@ class InfoSpider(ProxyMixin):
         ths: list[etree._Element] = html.xpath(
             '//*[@id="productDetails_detailBullets_sections1"]/tr/th'
         )
+        span: list[etree._Element] = html.xpath(
+            '//*[@id="detailBullets_feature_div"]/ul/li/span/span'
+        )
+        product_dimensions, item_weight = "", ""
+        if ths:
-        tds: list[etree._Element] = html.xpath(
-            '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
-        )
-        detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
+            tds: list[etree._Element] = html.xpath(
+                '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
+            )
+            detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
+            product_dimensions = detail.get("Product Dimensions", "")
+            item_weight = detail.get("Item Weight", "")
+        elif span:
+            detail = dict(
+                map(Tool.clean_text, (span[i].text.strip(), span[i + 1].text.strip()))
+                for i in range(0, len(span), 2)
+            )
+            package_dimensions = detail.get("Package Dimensions", "").split("; ")
+            if len(package_dimensions) == 2:
+                product_dimensions, item_weight = package_dimensions
         free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
         return {
             "free_delivery": free_delivery,
"product_dimensions": detail.get("Product Dimensions", ""), "product_dimensions": product_dimensions,
"item_weight": detail.get("Item Weight", ""), "item_weight": item_weight,
         }
 
     @retry(
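
A rough sketch of how the new detailBullets fallback pairs label/value spans, assuming the page alternates label and value span texts. The strings below are invented, and clean_text is re-declared locally (same regexes as the Tool.clean_text added above) so the snippet runs on its own.

import re

def clean_text(text):
    # Same two regexes as the Tool.clean_text helper in the hunk above
    cleaned = re.sub(r'[\x00-\x1f\x7f-\x9f\u2000-\u200f]', '', text)
    return re.sub(r'[:\s]+', ' ', cleaned).strip()

# Invented label/value texts standing in for span[i].text / span[i + 1].text
span_texts = [
    "Package Dimensions \u200f : \u200e ",
    "10 x 8 x 2 inches; 1.2 Pounds",
    "Date First Available \u200f : \u200e ",
    "January 1, 2024",
]

# dict() accepts any iterable of two-item iterables, so each map(clean_text, (label, value))
# becomes one key/value entry, mirroring the generator expression in the diff
detail = dict(
    map(clean_text, (span_texts[i], span_texts[i + 1]))
    for i in range(0, len(span_texts), 2)
)

product_dimensions, item_weight = "", ""
parts = detail.get("Package Dimensions", "").split("; ")
if len(parts) == 2:
    product_dimensions, item_weight = parts

print(product_dimensions)  # 10 x 8 x 2 inches
print(item_weight)         # 1.2 Pounds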
@@ -179,13 +202,15 @@ class InfoSpider(ProxyMixin):
         asin = Tool.get_url_asin(url)
         url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
         _proxy = self.get_proxy()
-        # if IS_DEBUG:
-        # _proxy = {"proxy": None}
+        if IS_DEBUG:
+            logger.debug(url)
         if _proxy is None:
             raise ValueError("没有代理")
         try:
-            headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
+            headers = self.task_manager.get_loca_cookie(
+                site=self.site, postcode=self.postcode.value
+            )
             text = Request.request_html(
                 url,
                 _proxy["proxy"],
...