yexing / amazon-mult-site-sync · Commits

Commit 1fb7b8ca
authored Jun 13, 2025 by yexing

Merge branch 'master' of http://124.223.61.50:8099/yexing/amazon-mult-site-sync

Parents: 767321a7, d260882e

Showing 10 changed files with 564 additions and 565 deletions (+564 -565)
.gitignore                +10   -0
amazon_cookie.py          +256  -257
callback.py               +140  -140
celery_app.py             +1    -1
db.py                     +1    -1
spider/base_detail.py     +1    -6
spider/base_monitor.py    +1    -2
spider/base_search.py     +0    -3
task.py                   +152  -153
tool.py                   +2    -2
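Read together, the diffs below are mostly one sweep: the cookie and task modules are converted from asyncio to plain synchronous calls (async def/await dropped, time.sleep in place of asyncio.sleep, redis in place of redis.asyncio, curl_cffi's Session in place of AsyncSession), and the spider modules drop their un-awaited asyncio.gather calls. The remaining changes are a new .gitignore, a one-line Celery routing fix that sends monitor_spider_task to the monitor queue, and a callback.py diff whose displayed lines are identical on both sides.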
.gitignore  (new file, 0 → 100644) @ 1fb7b8ca

+__pycache__
+.vscode
+celerybeat-*
+.pytest_cache
+log
+pid
+image
+.idea
+config
+tmp
amazon_cookie.py @ 1fb7b8ca

+from time import sleep
 import asyncio
 import json
 import random
 import re
 import time
 import traceback
 from typing import Self

 import requests
 from DrissionPage import Chromium
 from DrissionPage.common import Settings
 from DrissionPage.items import WebPageTab
 from loguru import logger
 from redis.asyncio import from_url


 class Data:
     @classmethod
     def items(cls):
         return (
             (k, v)
             for k, v in cls.__dict__.items()
             if isinstance(v, (str, int)) and not k.startswith("__")
         )

     @classmethod
     def values(cls):
         return (v for _, v in cls.items())

     @classmethod
     def zip(cls, other: Self):
         return ((v, getattr(other, k)) for k, v in cls.items())

     @classmethod
     def inverse_dict(cls):
         return {v: k for k, v in cls.items()}


 class Postcode(Data):
     com = "20001"
     de = "55545"
     it = "66040"
     fr = "75000"
     es = "04810"
     jp = "496-0805"


 class Site(Data):
     com = "com"
     # Germany, Italy, France, Spain, Japan
     de = "de"
     it = "it"
     fr = "fr"
     es = "es"
     jp = "co.jp"


 IS_DEBUG = False
 DOMAIN = "https://20tools.net"
 redis_config = {
     "url": "redis://:a123456,a@localhost:6379/10",
     "max_connections": 300
 }
 cookie_config = {
     "cookie_time_key": "cookie_expired_time"
 }
 UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"

 Settings.set_raise_when_wait_failed(False)


 class RedisSingleton:
     _redis_pool = None

     def __init__(self, redis_url=None):
         self.redis_url = redis_url

-    async def get_connection(self):
+    def get_connection(self):
         if self._redis_pool is None:
             if self.redis_url:
-                self._redis_pool = await from_url(self.redis_url, decode_responses=True)
+                self._redis_pool = from_url(self.redis_url, decode_responses=True)
             else:
                 # Default connection URL
-                self._redis_pool = await from_url('redis://localhost', decode_responses=True)
+                self._redis_pool = from_url('redis://localhost', decode_responses=True)
         return self._redis_pool


 class SiteType(Data):
     com = 1
     de = 2


-async def callback(param):
+def callback(param):
     """
     Callback endpoint
     """
     requests.post(
         f"{DOMAIN}/api/collection/cookie",
         headers={
             "Content-Type": "application/json",
             "Accept": "application/json",
         },
         data=json.dumps(param),
     )


-async def refresh_local_cookie(data: dict, site: str = "com"):
+def refresh_local_cookie(data: dict, site: str = "com"):
     """
     Refresh the local cookie
     """
-    redis = await redis_singleton.get_connection()
-    await redis.set(f"cookie:{site}", json.dumps(data))
+    redis = redis_singleton.get_connection()
+    redis.set(f"cookie:{site}", json.dumps(data))


-async def get_cookie_error():
+def get_cookie_error():
     """
     Get the cookie error counter
     :return:
     """
-    redis = await redis_singleton.get_connection()
-    return await redis.get("amazon:cookie-error")
+    redis = redis_singleton.get_connection()
+    return redis.get("amazon:cookie-error")


-async def delete_cookie_error():
+def delete_cookie_error():
     """
     Delete the cookie error counter
     :return:
     """
-    redis = await redis_singleton.get_connection()
-    return await redis.delete("amazon:cookie-error")
+    redis = redis_singleton.get_connection()
+    return redis.delete("amazon:cookie-error")


-async def input_postcode(
+def input_postcode(
     tab: WebPageTab, postcode: str, locator: str = "#GLUXZipUpdateInput"
 ):
     location_input = tab.ele(locator, timeout=3)
     if location_input is None:
         raise Exception("Input element not found")
     location_input.input(postcode)
-    await asyncio.sleep(1)
+    sleep(1)


-async def get_cookie(tab: WebPageTab, site_type: int = 1):
+def get_cookie(tab: WebPageTab, site_type: int = 1):
     """
     Fetch the cookie
     """
     cookie = tab.cookies().as_str()
     content = tab.html
     token = re.findall(r"name=\"stylesnap\" value=\"(.*?)\">", content)
     response = {
         "cookie": cookie,
         "token": token[0] if token else "",
         "user-agent": UA,
         "time": int(time.time()),
     }
     logger.info(f"Got cookie: {json.dumps(response)}")
-    await callback({"type": site_type, "data": response})
+    callback({"type": site_type, "data": response})
     return cookie


-async def run(site: str = "com", postcode: str = "20001", site_type: int = 1):
-    async def _close():
-        cookie = await get_cookie(tab, site_type)
+def run(site: str = "com", postcode: str = "20001", site_type: int = 1):
+    def _close():
+        cookie = get_cookie(tab, site_type)
         if IS_DEBUG:
-            await refresh_local_cookie({"cookie": cookie, "user-agent": UA}, site=site)
+            refresh_local_cookie({"cookie": cookie, "user-agent": UA}, site=site)
         chromium.clear_cache()
         chromium.quit()
-        await delete_cookie_error()
+        delete_cookie_error()

     if not IS_DEBUG:
-        number = await get_cookie_error()
+        number = get_cookie_error()
         number = int(number) if number else 0
         if number < 50:
             logger.success("Cookie OK")
             return
     logger.error("Cookie abnormal")
     chromium = Chromium()
     tab = chromium.latest_tab
     try:
         # &currency=JPY
         tab.get(f"https://www.amazon.{site}/stylesnap?language=en_GB")
         # Check whether the postcode is already correct
         line = tab.ele("#glow-ingress-line2", timeout=3)
         nav = tab.ele("#icp-nav-flyout", timeout=3)
         if line and nav:
             postal, lang = line.text, nav.text
             if (not postal or postcode not in postal) or (not lang or not "EN" not in lang):
                 logger.info("Postcode or language wrong; setting postcode and language")
             else:
                 logger.info("Postcode and language correct")
-                await _close()
+                _close()
                 return
         location = tab.ele("#nav-global-location-popover-link", timeout=3)
         if not location:
             raise Exception("Did not land on the expected page")
         else:
             location.click()
         postcode_parts = postcode.split("-")
         if len(postcode_parts) == 2:
-            await input_postcode(tab, postcode_parts[0], "#GLUXZipUpdateInput_0")
-            await input_postcode(tab, postcode_parts[1], "#GLUXZipUpdateInput_1")
+            input_postcode(tab, postcode_parts[0], "#GLUXZipUpdateInput_0")
+            input_postcode(tab, postcode_parts[1], "#GLUXZipUpdateInput_1")
         else:
-            await input_postcode(tab, postcode)
+            input_postcode(tab, postcode)
         locs = [
             "#GLUXZipUpdate",
             'xpath://*[@id="a-popover-1"]/div/header/button',
             'xpath://*[@id="icp-nav-flyout"]/button',
             "@text()=English",
         ]
         for loc in locs:
             ele = tab.ele(loc, timeout=3)
             if not ele:
                 raise ValueError("Element locator error")
             ele.wait.clickable(timeout=3, raise_err=False).click()
             tab.wait(2)
-        await _close()
+        _close()
     except Exception as e:
         logger.error(e)


-async def main():
+def main():
     if IS_DEBUG:
         items = random.choices(list(Site.zip(Postcode)))
     else:
         items = Site.zip(Postcode)
     for site, postcode in items:
         site_type = SiteType.__dict__.get(site)
         if site_type is None:
             continue
         logger.info(f"Fetching cookie: {site} {postcode}")
-        await run(site, postcode)
-        await asyncio.sleep(10)
+        run(site, postcode)
+        sleep(10)
     if IS_DEBUG:
         exit()


 if __name__ == "__main__":
     while True:
         try:
             redis_singleton = RedisSingleton(redis_url=redis_config["url"])
             asyncio.run(main())
         except:
             traceback.print_exc()
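For reference, the Data helper at the top of this file drives the site/postcode pairing: Site.zip(Postcode) walks Site's class dict and pairs each site suffix with the same-named attribute on Postcode. A minimal self-contained sketch, using trimmed copies of the classes above (typing.Self needs Python 3.11+):

    from typing import Self

    class Data:
        @classmethod
        def items(cls):
            # (name, value) pairs for the plain str/int class attributes
            return ((k, v) for k, v in cls.__dict__.items()
                    if isinstance(v, (str, int)) and not k.startswith("__"))

        @classmethod
        def zip(cls, other: Self):
            # pair each of our values with the same-named attribute on `other`
            return ((v, getattr(other, k)) for k, v in cls.items())

    class Site(Data):
        com = "com"
        de = "de"

    class Postcode(Data):
        com = "20001"
        de = "55545"

    print(list(Site.zip(Postcode)))   # [('com', '20001'), ('de', '55545')]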
callback.py @ 1fb7b8ca

(+140 -140; the old and new sides render identically here, consistent with whitespace-only changes under the "Hide whitespace changes" view.)

import json
import os
import threading
import time

import redis
from curl_cffi import requests
from loguru import logger

from conf import config

COLL_DOMAIN = config['app']['coll_domain']
_redis_db = redis.Redis.from_url(config['redis']['url'], decode_responses=True)
task_monitoring_config = config['task-monitoring']
task_search_config = config['task-search']
task_product_detail_config = config['task-product-detail']

DEFAULT_HEADER = {
    "Content-Type": "application/json",
    "Accept": "application/json",
}


def task_callback(data):
    """
    Callback endpoint
    """
    try:
        url = f"{COLL_DOMAIN}/api/collection/task"
        response = requests.post(url, headers=DEFAULT_HEADER, data=json.dumps(data), verify=False)
        data = response.json()
        if data["code"] == 0:
            return True
        else:
            return False
    except Exception as e:
        logger.error(f"Callback error: {e}")
        return False


def batch_callback(callback_key: str):
    thread = []
    number = _redis_db.llen(callback_key)
    logger.info(f"Callback {callback_key}: {number} tasks in total")
    for _ in range(10):
        data = _redis_db.lpop(callback_key)
        if data:
            result = json.loads(data)
            thread.append(threading.Thread(target=task_callback, args=(result,)))
        else:
            break
    for t in thread:
        t.start()
    for t in thread:
        t.join()


def callback_task(callback_key: str):
    """
    Callback task
    :param callback_key:
    :return:
    """
    task_number = 500
    result = []
    try:
        if callback_key == task_monitoring_config.get('item_key'):
            for _ in range(task_number):
                data = _redis_db.lpop(callback_key)
                if data:
                    result.append(json.loads(data))
                else:
                    break
            if result:
                logger.info(f"Callback {callback_key}: {len(result)} tasks in total")
                logger.info(f"Callback result: {json.dumps(result)}")
                callback = {
                    "data": {
                        "error_items": [],
                        "collection": result,
                    },
                    "type": 5,
                }
                task_callback(callback)
        elif callback_key == task_product_detail_config.get('item_key') or callback_key == task_search_config.get('item_key'):
            batch_callback(callback_key)
    except:
        logger.error(f"Callback error")


def run(task_config: dict = task_monitoring_config):
    CALLBACK_PID_FILE = "./pid/callback.pid"
    while True:
        if not os.path.exists(CALLBACK_PID_FILE):
            logger.error('Task exiting')
            break
        try:
            callback_key = task_config.get('item_key')
            callback_task(callback_key)
            logger.info(f"Callback {callback_key} done")
        except Exception as e:
            logger.error(f"Task error: {e}")
        callback_sleep_time = int(task_config.get('callback_sleep_time', 5))
        time.sleep(callback_sleep_time)


if __name__ == '__main__':
    tasks = []
    PID_FILES = [
        "monitoring.pid",
        "product_detail.pid",
        "search.pid",
        "callback.pid",
    ]
    for PID_FILE in PID_FILES:
        with open(f"./pid/{PID_FILE}", 'w') as f:
            f.write(str(os.getpid()))
    if task_monitoring_config.get('enabled', None) == 'True':
        logger.info(f"Collection-task callback started")
        t = threading.Thread(target=run, args=(task_monitoring_config,))
        tasks.append(t)
    if task_product_detail_config.get('enabled', None) == 'True':
        logger.info(f"Product-publish callback started")
        t = threading.Thread(target=run, args=(task_product_detail_config,))
        tasks.append(t)
    if task_search_config.get('enabled', None) == 'True':
        logger.info(f"Search callback started")
        t = threading.Thread(target=run, args=(task_search_config,))
        tasks.append(t)
    for t in tasks:
        t.start()
    for t in tasks:
        t.join()
celery_app.py @ 1fb7b8ca

...
@@ -33,7 +33,7 @@ app.conf.task_queues = (
 )

 app.conf.task_routes = {
     "celery_tasks.detail_spider_task": {"queue": "detail"},
-    "celery_tasks.monitor_spider_task": {"queue": "detail"},
+    "celery_tasks.monitor_spider_task": {"queue": "monitor"},
     "celery_tasks.search_spider_task": {"queue": "search"},
     "celery_tasks.*_dial_task": {"queue": "dial"},
     "celery_tasks.*": {"queue": "detail"},
...
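The one-line change above is a routing fix: monitor_spider_task previously landed in the detail queue and now routes to its own monitor queue, so a worker has to consume that queue for monitor tasks to run. Assuming the Celery app object lives in celery_app.py (as the filename suggests), starting such a worker would look something like:

    celery -A celery_app worker -Q monitor -l info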
db.py @ 1fb7b8ca

 # import aioredis
-from redis.asyncio import from_url
+from redis import from_url


 class RedisSingleton:
...
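The import swap above mirrors the async-to-sync conversion in the rest of the commit: redis.asyncio.from_url yields a client whose commands are coroutines, while redis.from_url yields a blocking client. A minimal sketch of what that means at call sites (not from the repo; assumes a Redis server on localhost):

    import asyncio
    from redis import from_url as sync_from_url
    from redis.asyncio import from_url as async_from_url

    def sync_usage():
        r = sync_from_url("redis://localhost", decode_responses=True)
        r.set("k", "v")            # blocking; returns the result directly
        return r.get("k")

    async def async_usage():
        r = async_from_url("redis://localhost", decode_responses=True)  # connects lazily
        await r.set("k", "v")      # commands are coroutines and must be awaited
        return await r.get("k")

    if __name__ == "__main__":
        print(sync_usage())
        print(asyncio.run(async_usage()))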
spider/base_detail.py @ 1fb7b8ca

 from time import sleep
-import asyncio
 import html
 import json
 import random
...
@@ -868,9 +867,7 @@ class Goods:
         # In batches
         if len(collection_skus) > 0:
             for i in range(0, len(collection_skus), 8):
-                for response in asyncio.gather(
-                    *collection_skus[i : i + 8]
-                ):
+                for response in collection_skus[i : i + 8]:
                     try:
                         if response.get("brand"):
                             brand.append(response["brand"])
...
@@ -1020,8 +1017,6 @@ class Goods:
         tasks = [json.loads(task) for task in tasks]
         for task in tasks:
             queue.append(self.run(task))
-        if queue:
-            asyncio.gather(*queue)
         logger.info(f"Task duration: {time.time() - start_time}")
         if self.is_debug:
...
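The removed lines here (and in spider/base_monitor.py and spider/base_search.py below) used asyncio.gather without await; gather returns an awaitable that only yields results once awaited inside a coroutine, and the commit drops the pattern in favor of iterating the plain lists. For reference, a minimal sketch of the awaited batching pattern the old code appears to have been aiming at (hypothetical fetch coroutine, not from the repo):

    import asyncio

    async def fetch(sku: str) -> dict:
        await asyncio.sleep(0)                 # stand-in for real network I/O
        return {"sku": sku, "brand": "acme"}

    async def main(skus: list[str]) -> None:
        coros = [fetch(s) for s in skus]
        for i in range(0, len(coros), 8):      # batches of 8, as in base_detail.py
            responses = await asyncio.gather(*coros[i : i + 8])
            for response in responses:
                if response.get("brand"):
                    print(response["brand"])

    asyncio.run(main([f"sku{n}" for n in range(20)]))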
spider/base_monitor.py @ 1fb7b8ca

 from time import sleep
-import asyncio
 import json
 import re
 import time
...
@@ -420,7 +419,7 @@ class Monitoring:
         success_number = 0
         logger.info(f"Task count: {len(queue)}")
         if queue:
-            for items in asyncio.gather(*queue):
+            for items in queue:
                 success_number += 1
         logger.info(f"Task duration: {time.time() - start_time}, successes: {success_number}")
spider/base_search.py @ 1fb7b8ca

 from time import sleep
-import asyncio
 import functools
 import json
 import os
...
@@ -650,8 +649,6 @@ class Search:
         tasks = [json.loads(task) for task in tasks]
         for task in tasks:
             queue.append(self.run(task))
-        if queue:
-            asyncio.gather(*queue)
         logger.info(f"Task duration: {time.time() - start_time}")
         if self.is_debug:
...
task.py @ 1fb7b8ca

+from time import sleep
 import asyncio
 import json
 import os
 import threading
 import time

 import redis
 from curl_cffi import requests
 from loguru import logger

 from conf import config
 from const import Site
 from db import RedisSingleton
 from tool import Task

 COLL_DOMAIN = config['app']['coll_domain']
 _redis_db = redis.Redis.from_url(config['redis']['url'], decode_responses=True)
 task_monitoring_config = config['task-monitoring']
 task_search_config = config['task-search']
 task_product_detail_config = config['task-product-detail']
 redis_config = config['redis']
 cookie_config = config['cookie']

 DEFAULT_HEADER = {
     "Content-Type": "application/json",
     "Accept": "application/json",
 }

 PID_FILE = './pid/task.pid'


 def get_task(task_key: str = task_monitoring_config['queue_key'], number: int = 1):
     """
     Fetch tasks
     :param task_key:
     :param number:
     :return:
     """
     try:
         url = f"{COLL_DOMAIN}/api/collection/task?number={number}&queue={task_key}"
         response = requests.get(url, headers=DEFAULT_HEADER, verify=False)
         response = response.json()
         if response["code"] == 0:
             return response["data"]
         else:
             return {}
     except Exception as e:
         logger.error(f"Error fetching tasks: {e}")
         return {}


 def add_task(task_key: str, redis_key: str, task_number: int):
     """
     Add tasks
     :param task_key:
     :param redis_key:
     :param task_number:
     :return:
     """
     items = get_task(task_key, task_number)
     task_lists = items.get('list', [])
     if task_lists:
         logger.info(f"{task_key} - {len(task_lists)} tasks queued")
         for item in task_lists:
             _redis_db.lpush(redis_key, json.dumps(item))


 def run(task_config: dict = task_monitoring_config):
     while True:
         if not os.path.exists(PID_FILE):
             logger.error('Task exiting')
             break
         add_task_enabled = task_config.get('add_task_enabled')
         task_number = int(task_config.get('number'))
         task_min_number = int(task_config.get('min_number'))
         redis_key = task_config.get('task_key')
         todo_task_count = _redis_db.llen(redis_key)
         task_key = task_config.get('queue_key')
         try:
             task_keys = task_key.split(',')
             logger.info(f"{redis_key} queue length: {todo_task_count}")
             if todo_task_count <= task_min_number and add_task_enabled == 'True':
                 for key in task_keys:
                     add_task(key, redis_key, task_number)
         except KeyboardInterrupt:
             logger.error(f"Task aborted")
         except Exception as e:
             logger.error(f"Task error: {redis_key} : {e}")
         time.sleep(5)


-async def cookie():
+def cookie():
     for site in Site.values():
         time_key = cookie_config['cookie_time_key']
         time_key = f"{time_key}:{site}"
         _redis_db.delete(time_key)
     while True:
         if not os.path.exists(PID_FILE):
             logger.error('Task exiting')
             break
         logger.info(f"Fetching cookie")
         for site in Site.values():
             try:
-                await task_manager.get_cookie(site)
+                task_manager.get_cookie(site)
             except:
                 logger.error(f"Error fetching cookie")
-        await asyncio.sleep(5)
+        sleep(5)


 if __name__ == '__main__':
     tasks = []
     redis_singleton = RedisSingleton(redis_url=redis_config['url'])
     task_manager = Task(redis_singleton)
     with open(PID_FILE, 'w') as f:
         f.write(str(os.getpid()))
     if task_monitoring_config.get('enabled', None) == 'True':
         logger.info(f"Monitoring task added")
         t = threading.Thread(target=run, args=(task_monitoring_config,))
         tasks.append(t)
     else:
         logger.info(f"Monitoring task not started")
     if task_product_detail_config.get('enabled', None) == 'True':
         logger.info(f"Publish task added")
         t = threading.Thread(target=run, args=(task_product_detail_config,))
         tasks.append(t)
     else:
         logger.info(f"Publish task not started")
     if task_search_config.get('enabled', None) == 'True':
         logger.info(f"Search task added")
         t = threading.Thread(target=run, args=(task_search_config,))
         tasks.append(t)
     else:
         logger.info(f"Search task not started")
     t = threading.Thread(target=asyncio.run, args=(cookie(),))
     tasks.append(t)
     for t in tasks:
         t.start()
     for t in tasks:
         t.join()
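One observation on the unchanged bootstrap line above: the cookie worker is still spawned as threading.Thread(target=asyncio.run, args=(cookie(),)). That pattern relied on cookie() returning a coroutine object; the args tuple is built eagerly, so with cookie now a plain function, cookie() executes at argument-evaluation time instead. A hypothetical follow-up (not in this commit) would target the function directly:

    t = threading.Thread(target=cookie)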
tool.py @ 1fb7b8ca

...
@@ -7,7 +7,7 @@ from datetime import datetime, timedelta
 from babel.dates import get_month_names, get_day_names
 from curl_cffi import requests
-from curl_cffi.requests import AsyncSession
+from curl_cffi.requests import Session
 from dateutil import parser
 from dateutil.relativedelta import relativedelta
 from loguru import logger
...
@@ -145,7 +145,7 @@ class Request:
         :param url:
         :return:
         """
-        with AsyncSession(max_clients=50) as s:
+        with Session() as s:
             # Clear the request's values
             s.headers.clear()
             s.cookies.clear()
...
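curl_cffi exposes both session types: AsyncSession is meant to be used as an async context manager with awaited requests, while Session is the blocking counterpart the new code switches to. A minimal sketch of the synchronous usage (not from the repo; the URL is an arbitrary example):

    from curl_cffi.requests import Session

    with Session() as s:
        # start from a clean slate, as the code above does
        s.headers.clear()
        s.cookies.clear()
        r = s.get("https://httpbin.org/get")
        print(r.status_code)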