Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
amazon-mult-site-sync
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
yexing
amazon-mult-site-sync
Commits
7e696e48
Commit
7e696e48
authored
Jun 16, 2025
by
yexing
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
u
parent
11e330a5
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
10 deletions
+35
-10
spider/base_info.py
+35
-10
No files found.
spider/base_info.py
View file @
7e696e48
...
...
@@ -88,6 +88,14 @@ class Tool:
data_json
=
re
.
findall
(
"jQuery.parseJSON
\
('(.*)'
\
)"
,
text
)
data_json
=
data_json
[
0
]
.
replace
(
"
\\
'"
,
"'"
)
return
json
.
loads
(
data_json
)
@staticmethod
def clean_text(text):
    """Normalize a piece of scraped text into one clean, trimmed line.

    Invisible non-whitespace characters (C0/C1 control codes and the
    zero-width / bidirectional marks U+200B-U+200F) are removed. Every run
    of whitespace and/or colons is then collapsed into a single space and
    the result is stripped at both ends.

    Whitespace-class characters (tab, newline, the Unicode spaces
    U+2000-U+200A, ...) are treated as separators rather than deleted, so
    words split only by a line break stay separate instead of being glued
    together.

    :param text: raw string to clean.
    :return: the cleaned string; empty if nothing visible remains.
    """
    # Delete only characters that the ``\s`` class below does NOT match;
    # tab/newline/etc. must survive this step so they still act as
    # separators when runs are collapsed.
    visible = re.sub(
        r'[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f\u200b-\u200f]', '', text
    )
    # Colons are folded together with whitespace so "Key  :  " -> "Key".
    return re.sub(r'[:\s]+', ' ', visible).strip()
class
ProxyMixin
:
...
...
@@ -100,9 +108,9 @@ class ProxyMixin:
"""
if
self
.
is_debug
:
test_proxy
=
"127.0.0.1:7890"
proxy
=
"#1#2#127.0.0.1:7890"
proxy
=
"#1#2#127.0.0.1:7890"
else
:
proxy
=
self
.
proxy_manager
.
get_proxy
()
proxy
=
self
.
proxy_manager
.
get_proxy
()
if
proxy
is
None
:
return
None
test_proxy
=
proxy
.
split
(
"#"
)[
0
]
...
...
@@ -158,15 +166,30 @@ class InfoSpider(ProxyMixin):
ths
:
list
[
etree
.
_Element
]
=
html
.
xpath
(
'//*[@id="productDetails_detailBullets_sections1"]/tr/th'
)
tds
:
list
[
etree
.
_Element
]
=
html
.
xpath
(
'//*[@id="
productDetails_detailBullets_sections1"]/tr/td
'
span
:
list
[
etree
.
_Element
]
=
html
.
xpath
(
'//*[@id="
detailBullets_feature_div"]/ul/li/span/span
'
)
detail
=
{
th
.
text
.
strip
():
td
.
text
.
strip
()
for
th
,
td
in
zip
(
ths
,
tds
)}
product_dimensions
,
item_weight
=
""
,
""
if
ths
:
tds
:
list
[
etree
.
_Element
]
=
html
.
xpath
(
'//*[@id="productDetails_detailBullets_sections1"]/tr/td'
)
detail
=
{
th
.
text
.
strip
():
td
.
text
.
strip
()
for
th
,
td
in
zip
(
ths
,
tds
)}
product_dimensions
=
detail
.
get
(
"Product Dimensions"
,
""
)
item_weight
=
detail
.
get
(
"Item Weight"
,
""
)
elif
span
:
detail
=
dict
(
map
(
Tool
.
clean_text
,
(
span
[
i
]
.
text
.
strip
(),
span
[
i
+
1
]
.
text
.
strip
()))
for
i
in
range
(
0
,
len
(
span
),
2
)
)
package_dimensions
=
detail
.
get
(
"Package Dimensions"
,
""
)
.
split
(
"; "
)
if
len
(
package_dimensions
)
==
2
:
product_dimensions
,
item_weight
=
package_dimensions
free_delivery
=
Fmt
.
parse_date
(
free_delivery
[
0
])
if
len
(
free_delivery
)
else
""
return
{
"free_delivery"
:
free_delivery
,
"product_dimensions"
:
detail
.
get
(
"Product Dimensions"
,
""
)
,
"item_weight"
:
detail
.
get
(
"Item Weight"
,
""
)
,
"product_dimensions"
:
product_dimensions
,
"item_weight"
:
item_weight
,
}
@retry
(
...
...
@@ -179,13 +202,15 @@ class InfoSpider(ProxyMixin):
asin
=
Tool
.
get_url_asin
(
url
)
url
=
f
"https://www.amazon.{self.site}/dp/"
+
asin
+
"?th=1&psc=1"
_proxy
=
self
.
get_proxy
()
#
if IS_DEBUG:
# _proxy = {"proxy": None}
if
IS_DEBUG
:
logger
.
debug
(
url
)
if
_proxy
is
None
:
raise
ValueError
(
"没有代理"
)
try
:
headers
=
self
.
task_manager
.
get_loca_cookie
(
site
=
self
.
site
,
postcode
=
self
.
postcode
.
value
)
headers
=
self
.
task_manager
.
get_loca_cookie
(
site
=
self
.
site
,
postcode
=
self
.
postcode
.
value
)
text
=
Request
.
request_html
(
url
,
_proxy
[
"proxy"
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment