mirror of
https://github.com/ProudMuBai/GoFilm.git
synced 2026-05-15 01:27:30 +08:00
collect optimization
This commit is contained in:
@@ -24,12 +24,11 @@ func FilmSourceInit() {
|
||||
{Id: util.GenerateSalt(), Name: "HD(LZ)", Uri: `https://cj.lziapi.com/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(BF)", Uri: `https://bfzyapi.com/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false, Interval: 2500},
|
||||
{Id: util.GenerateSalt(), Name: "HD(FF)", Uri: `http://cj.ffzyapi.com/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(OK)", Uri: `https://okzyapi.com/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(HM)", Uri: `https://json.heimuer.xyz/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(OK)", Uri: `https://api.okzyw.net/api.php/provide/vod/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(LY)", Uri: `https://360zy.com/api.php/provide/vod/at/json`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(SN)", Uri: `https://suoniapi.com/api.php/provide/vod/from/snm3u8/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false, Interval: 2000},
|
||||
{Id: util.GenerateSalt(), Name: "HD(DB)", Uri: `https://caiji.dbzy.tv/api.php/provide/vod/from/dbm3u8/at/josn/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(IK)", Uri: `https://ikunzyapi.com/api.php/provide/vod/at/json`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
{Id: util.GenerateSalt(), Name: "HD(TT)", Uri: `https://caiji.dyttzyapi.com/api.php/provide/vod/at/json/`, ResultModel: system.JsonResult, Grade: system.SlaveCollect, SyncPictures: false, CollectType: system.CollectVideo, State: false},
|
||||
}
|
||||
err := system.SaveCollectSourceList(l)
|
||||
if err != nil {
|
||||
|
||||
@@ -3,7 +3,6 @@ package conver
|
||||
import (
|
||||
"encoding/xml"
|
||||
"log"
|
||||
"regexp"
|
||||
"server/config"
|
||||
"server/model/collect"
|
||||
"server/model/system"
|
||||
@@ -77,7 +76,6 @@ func ConvertFilmDetail(detail collect.FilmDetail) system.MovieDetail {
|
||||
1.对常见分割符进行统一化处理
|
||||
2.如果演员和导演名单过长,则进行截断, 最多只保留3个
|
||||
*/
|
||||
detail.VodActor = regexp.MustCompile(`[$&#%]`).ReplaceAllString(detail.VodActor, ",")
|
||||
|
||||
md := system.MovieDetail{
|
||||
Mid: detail.VodID,
|
||||
@@ -113,7 +111,7 @@ func ConvertFilmDetail(detail collect.FilmDetail) system.MovieDetail {
|
||||
md.PlayFrom = strings.Split(detail.VodPlayFrom, detail.VodPlayNote)
|
||||
// v2 只保留m3u8播放源
|
||||
md.PlayList = GenFilmPlayList(detail.VodPlayURL, detail.VodPlayNote)
|
||||
md.DownloadList = GenFilmPlayList(detail.VodDownURL, detail.VodPlayNote)
|
||||
//md.DownloadList = GenFilmPlayList(detail.VodDownURL, detail.VodPlayNote)
|
||||
|
||||
return md
|
||||
}
|
||||
|
||||
@@ -13,41 +13,40 @@ import (
|
||||
func CovertFilmDetailVo(fd system.FilmDetailVo) (system.MovieDetail, error) {
|
||||
t, err := time.ParseInLocation(time.DateTime, fd.AddTime, time.Local)
|
||||
md := system.MovieDetail{
|
||||
Id: fd.Id,
|
||||
Cid: fd.Cid,
|
||||
Pid: fd.Pid,
|
||||
Name: fd.Name,
|
||||
Picture: fd.Picture,
|
||||
DownFrom: fd.DownFrom,
|
||||
MovieDescriptor: system.MovieDescriptor{
|
||||
SubTitle: fd.SubTitle,
|
||||
CName: fd.CName,
|
||||
EnName: fd.EnName,
|
||||
Initial: fd.Initial,
|
||||
ClassTag: fd.ClassTag,
|
||||
Actor: fd.Actor,
|
||||
Director: fd.Director,
|
||||
Writer: fd.Writer,
|
||||
Blurb: fd.Content,
|
||||
Remarks: fd.Remarks,
|
||||
ReleaseDate: fd.ReleaseDate,
|
||||
Area: fd.Area,
|
||||
Language: fd.Language,
|
||||
Year: fd.Year,
|
||||
State: fd.State,
|
||||
UpdateTime: fd.UpdateTime,
|
||||
AddTime: t.Unix(),
|
||||
DbId: fd.DbId,
|
||||
DbScore: fd.DbScore,
|
||||
Hits: fd.Hits,
|
||||
Content: fd.Content,
|
||||
},
|
||||
Mid: fd.Id,
|
||||
Cid: fd.Cid,
|
||||
Pid: fd.Pid,
|
||||
Name: fd.Name,
|
||||
Picture: fd.Picture,
|
||||
SubTitle: fd.SubTitle,
|
||||
CName: fd.CName,
|
||||
EnName: fd.EnName,
|
||||
Initial: fd.Initial,
|
||||
ClassTag: fd.ClassTag,
|
||||
Actor: fd.Actor,
|
||||
Director: fd.Director,
|
||||
Writer: fd.Writer,
|
||||
Blurb: fd.Content,
|
||||
Remarks: fd.Remarks,
|
||||
ReleaseDate: fd.ReleaseDate,
|
||||
Area: fd.Area,
|
||||
Language: fd.Language,
|
||||
Year: fd.Year,
|
||||
State: fd.State,
|
||||
UpdateTime: fd.UpdateTime,
|
||||
AddTime: t.Unix(),
|
||||
DbId: fd.DbId,
|
||||
DbScore: fd.DbScore,
|
||||
Hits: fd.Hits,
|
||||
Content: fd.Content,
|
||||
PlayFrom: fd.PlayFrom,
|
||||
DownFrom: fd.DownFrom,
|
||||
}
|
||||
// 通过分割符切分播放源信息 PlaySeparator $$$
|
||||
//md.PlayFrom = strings.Split(fd.VodPlayFrom, fd.VodPlayNote)
|
||||
// v2 只保留m3u8播放源
|
||||
md.PlayList = GenFilmPlayList(fd.PlayLink, "$$$")
|
||||
//md.DownloadList = GenFilmPlayList(fd.DownloadLink, fd.VodPlayNote)
|
||||
|
||||
md.DownloadList = GenFilmPlayList(fd.DownloadLink, "$$$")
|
||||
return md, err
|
||||
}
|
||||
|
||||
@@ -2,14 +2,15 @@ package util
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/gocolly/colly/v2"
|
||||
"github.com/gocolly/colly/v2/extensions"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gocolly/colly/v2"
|
||||
"github.com/gocolly/colly/v2/extensions"
|
||||
)
|
||||
|
||||
/*
|
||||
@@ -29,6 +30,18 @@ type RequestInfo struct {
|
||||
Err string `json:"err"` // 错误信息
|
||||
}
|
||||
|
||||
// CopyRequestInfo 属性复制, 隔离地址引用造成的并发问题
|
||||
func CopyRequestInfo(r RequestInfo) RequestInfo {
|
||||
// 初始化返回值
|
||||
newInfo := RequestInfo{Uri: r.Uri, Params: url.Values{}}
|
||||
|
||||
// 循环拷贝r的每个k,v
|
||||
for k, v := range r.Params {
|
||||
newInfo.Params[k] = append([]string(nil), v...)
|
||||
}
|
||||
return newInfo
|
||||
}
|
||||
|
||||
// RefererUrl 记录上次请求的url
|
||||
var RefererUrl string
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ package util
|
||||
import (
|
||||
"crypto/md5"
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/x509"
|
||||
"encoding/hex"
|
||||
"encoding/pem"
|
||||
@@ -61,24 +60,24 @@ func PasswordEncrypt(password, salt string) string {
|
||||
}
|
||||
|
||||
// ParsePriKeyBytes 解析私钥
|
||||
func ParsePriKeyBytes(buf []byte) (*rsa.PrivateKey, error) {
|
||||
func ParsePriKeyBytes(buf []byte) (any, error) {
|
||||
p := &pem.Block{}
|
||||
p, buf = pem.Decode(buf)
|
||||
if p == nil {
|
||||
return nil, errors.New("private key parse error")
|
||||
}
|
||||
return x509.ParsePKCS1PrivateKey(p.Bytes)
|
||||
return x509.ParsePKCS8PrivateKey(p.Bytes)
|
||||
}
|
||||
|
||||
// ParsePubKeyBytes 解析公钥
|
||||
func ParsePubKeyBytes(buf []byte) (*rsa.PublicKey, error) {
|
||||
func ParsePubKeyBytes(buf []byte) (any, error) {
|
||||
p, _ := pem.Decode(buf)
|
||||
if p == nil {
|
||||
return nil, errors.New("parse publicKey content nil")
|
||||
}
|
||||
pubKey, err := x509.ParsePKCS1PublicKey(p.Bytes)
|
||||
pubKey, err := x509.ParsePKIXPublicKey(p.Bytes)
|
||||
if err != nil {
|
||||
return nil, errors.New("x509.ParsePKCS1PublicKey error")
|
||||
return nil, errors.New("x509.ParsePKIXPublicKey error")
|
||||
}
|
||||
return pubKey, nil
|
||||
}
|
||||
@@ -130,7 +129,7 @@ func ValidPwd(s string) error {
|
||||
// TruncateBySep 截断字符串,保留指定数量的结果
|
||||
func TruncateBySep(s string, limit int) string {
|
||||
// 如果保留数量小于等于0则返回空值
|
||||
if limit <= 0 {
|
||||
if len(s) <= 0 || limit <= 0 {
|
||||
return ""
|
||||
}
|
||||
// 先强制对不同的分割符进行统一替换为 ,
|
||||
@@ -139,6 +138,9 @@ func TruncateBySep(s string, limit int) string {
|
||||
// Split 会在分隔符连续出现或出现在首尾时产生空字符串,这通常符合预期
|
||||
parts := strings.Split(s, ",")
|
||||
// 片段数量小于或等于限制,直接返回原字符串
|
||||
if len(parts) <= limit {
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
// 返回原字符串是为了保留原始的格式(比如末尾是否有分隔符)
|
||||
// 即使不截断也重新 Join 一遍(去除多余的空片段等)
|
||||
return strings.Join(parts[:limit], ",")
|
||||
|
||||
@@ -26,8 +26,9 @@ func InitMysql() (err error) {
|
||||
SingularTable: true, //是否使用 结构体名称作为表名 (关闭自动变复数)
|
||||
//NameReplacer: strings.NewReplacer("spider_", ""), // 替表名和字段中的 Me 为 空
|
||||
},
|
||||
Logger: logger.Default.LogMode(logger.Warn), //设置日志级别为Info
|
||||
//Logger: logger.Default.LogMode(logger.Info), //设置日志级别为Info
|
||||
//Logger: logger.Default.LogMode(logger.Warn), //设置日志级别为Info
|
||||
Logger: logger.Default.LogMode(logger.Info), //设置日志级别为Info
|
||||
//Logger: logger.Default.LogMode(logger.Error), //设置日志级别为Info
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"server/model/system"
|
||||
"server/plugin/common/conver"
|
||||
"server/plugin/common/util"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -46,14 +47,22 @@ func HandleCollect(id string, h int) error {
|
||||
|
||||
// 生成 RequestInfo
|
||||
r := util.RequestInfo{Uri: s.Uri, Params: url.Values{}}
|
||||
// 如果 h == 0 则直接返回错误信息
|
||||
if h == 0 {
|
||||
log.Println(" Collect time cannot be zero ")
|
||||
return errors.New(" Collect time cannot be zer ")
|
||||
}
|
||||
// 如果 h = -1 则进行全量采集
|
||||
if h > 0 {
|
||||
// 通过 采集时长 h 的不同来执行不同出处理方式
|
||||
switch {
|
||||
case h < 0:
|
||||
// 采集时长为负数则先执行对应数据表的重置
|
||||
if s.Grade == system.MasterCollect {
|
||||
system.FilmZero()
|
||||
} else {
|
||||
// 如果所处站点是次级站点, 则删除对应站点在表中的数据
|
||||
system.ResetSlaveMovieInfoTable()
|
||||
}
|
||||
case h > 0:
|
||||
// 如果采集时长是正常数值, 则设置参数 h
|
||||
r.Params.Set("h", fmt.Sprint(h))
|
||||
default:
|
||||
log.Println("Params Collect time Exception !!!")
|
||||
return errors.New(" Params Collect time Exception !!! ")
|
||||
}
|
||||
// 2. 首先获取分页采集的页数
|
||||
pageCount, err := spiderCore.GetPageCount(r)
|
||||
@@ -86,22 +95,169 @@ func HandleCollect(id string, h int) error {
|
||||
// 如果分页数量较大则开启协程
|
||||
ConcurrentPageSpider(pageCount, s, h, collectFilm)
|
||||
}
|
||||
// 视频数据采集完成后同步相关信息到mysql
|
||||
// 视频数据采集完成后 对暂存数据进行处理和优化
|
||||
if s.Grade == system.MasterCollect {
|
||||
// 执行影片信息更新操作
|
||||
if h > 0 {
|
||||
// 执行数据更新操作
|
||||
system.SyncSearchInfo(1)
|
||||
} else {
|
||||
// 清空searchInfo中的数据并重新添加, 否则执行
|
||||
system.SyncSearchInfo(0)
|
||||
// 如果采集时长为负, (全量采集), 则在数据采集完成后为search表添加索引
|
||||
if h < 0 {
|
||||
// 全量采集时进行数据同步(仅保存)
|
||||
system.SyncMovieDetail(s.Id, s.Grade)
|
||||
system.AddSearchIndex()
|
||||
}
|
||||
// 采集时长在一定阈值内时执行redis数据同步 (存在则更新, 不存在则新增)
|
||||
|
||||
// 开启图片同步
|
||||
if s.SyncPictures {
|
||||
system.SyncFilmPicture()
|
||||
}
|
||||
// 每次成功执行完都清理redis中的相关API接口数据缓存
|
||||
ClearCache()
|
||||
} else if s.Grade == system.SlaveCollect {
|
||||
// 如果采集时长为负, (全量采集), 则在数据采集完成后为search表添加索引
|
||||
if h < 0 {
|
||||
// 全量采集时进行数据同步
|
||||
system.SyncMovieDetail(s.Id, s.Grade)
|
||||
}
|
||||
}
|
||||
|
||||
case system.CollectArticle, system.CollectActor, system.CollectRole, system.CollectWebSite:
|
||||
log.Println("暂未开放此采集功能!!!")
|
||||
return errors.New("暂未开放此采集功能")
|
||||
}
|
||||
log.Println("Spider Task Exercise Success")
|
||||
return nil
|
||||
}
|
||||
|
||||
func HandleCollectRefine(id string, h int) error {
|
||||
// 1. 首先通过ID获取对应采集站信息
|
||||
s := system.FindCollectSourceById(id)
|
||||
if s == nil {
|
||||
log.Println("Cannot Find Collect Source Site")
|
||||
return errors.New(" Cannot Find Collect Source Site ")
|
||||
} else if !s.State {
|
||||
log.Println(" The acquisition site was disabled ")
|
||||
return errors.New(" The acquisition site was disabled ")
|
||||
}
|
||||
// 如果是主站点且状态为启用则先获取分类tree信息
|
||||
if s.Grade == system.MasterCollect && s.State {
|
||||
// 是否存在分类树信息, 不存在则获取
|
||||
if !system.ExistsCategoryTree() {
|
||||
CollectCategory(s)
|
||||
}
|
||||
}
|
||||
// 生成 RequestInfo
|
||||
r := util.RequestInfo{Uri: s.Uri, Params: url.Values{}}
|
||||
// 通过 采集时长 h 的不同来执行不同前置出处理方式
|
||||
switch {
|
||||
case h < 0:
|
||||
// 采集时长为负数则先执行对应数据表的重置
|
||||
if s.Grade == system.MasterCollect {
|
||||
system.FilmZero()
|
||||
} else {
|
||||
// 如果所处站点是次级站点, 则删除对应站点在表中的数据
|
||||
system.ResetSlaveMovieInfoTable()
|
||||
}
|
||||
case h > 0:
|
||||
// 如果采集时长是正常数值, 则设置参数 h
|
||||
r.Params.Set("h", fmt.Sprint(h))
|
||||
default:
|
||||
log.Println("Params Collect time Exception !!!")
|
||||
return errors.New(" Params Collect time Exception !!! ")
|
||||
}
|
||||
// 通过采集类型分别执行不同的采集方法
|
||||
switch s.CollectType {
|
||||
case system.CollectVideo:
|
||||
// 采集视频资源 根据采集站类型进行不同逻辑
|
||||
switch s.Grade {
|
||||
case system.MasterCollect:
|
||||
// 获取展示的分类切片信息
|
||||
cl := system.GetRevealCategoryList()
|
||||
for _, c := range cl {
|
||||
// 获取分类采集页数
|
||||
r.Params.Set("t", fmt.Sprint(c.Id))
|
||||
pageCount, err := spiderCore.GetPageCount(r)
|
||||
if err != nil {
|
||||
// 如果第二次获取分页页数依旧获取失败则关闭当前采集任务
|
||||
pageCount, err = spiderCore.GetPageCount(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// 如果采集源参数中采集间隔参数大于500ms,则使用单线程采集
|
||||
if s.Interval > 500 {
|
||||
// 少量数据不开启协程
|
||||
for i := 1; i <= pageCount; i++ {
|
||||
// 设置采集参数pg
|
||||
r.Params.Set("pg", fmt.Sprint(i))
|
||||
collectFilmRefine(s, r)
|
||||
// 执行一次采集后休眠指定时长
|
||||
time.Sleep(time.Duration(s.Interval) * time.Millisecond)
|
||||
}
|
||||
} else if pageCount <= config.MAXGoroutine*5 {
|
||||
// 少量数据不开启协程
|
||||
for i := 1; i <= pageCount; i++ {
|
||||
r.Params.Set("pg", fmt.Sprint(i))
|
||||
collectFilmRefine(s, r)
|
||||
}
|
||||
} else {
|
||||
// 如果分页数量较大则开启协程
|
||||
collectFilmMT(pageCount, s, r, collectFilmRefine)
|
||||
}
|
||||
}
|
||||
case system.SlaveCollect:
|
||||
pageCount, err := spiderCore.GetPageCount(r)
|
||||
if err != nil {
|
||||
// 如果第二次获取分页页数依旧获取失败则关闭当前采集任务
|
||||
pageCount, err = spiderCore.GetPageCount(r)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// 如果采集源参数中采集间隔参数大于500ms,则使用单线程采集
|
||||
if s.Interval > 500 {
|
||||
// 少量数据不开启协程
|
||||
for i := 1; i <= pageCount; i++ {
|
||||
// 设置采集参数pg
|
||||
r.Params.Set("pg", fmt.Sprint(i))
|
||||
collectFilmRefine(s, r)
|
||||
// 执行一次采集后休眠指定时长
|
||||
time.Sleep(time.Duration(s.Interval) * time.Millisecond)
|
||||
}
|
||||
} else if pageCount <= config.MAXGoroutine*5 {
|
||||
// 少量数据不开启协程
|
||||
for i := 1; i <= pageCount; i++ {
|
||||
r.Params.Set("pg", fmt.Sprint(i))
|
||||
collectFilmRefine(s, r)
|
||||
}
|
||||
} else {
|
||||
// 如果分页数量较大则开启协程
|
||||
collectFilmMT(pageCount, s, r, collectFilmRefine)
|
||||
}
|
||||
}
|
||||
|
||||
// 视频数据采集完成后 对暂存数据进行处理和优化
|
||||
if s.Grade == system.MasterCollect {
|
||||
// 如果采集时长为负, (全量采集), 则在数据采集完成后为search表添加索引
|
||||
if h < 0 {
|
||||
// 全量采集时进行数据同步以及添加索引(仅保存)
|
||||
system.SyncMovieDetail(s.Id, s.Grade)
|
||||
system.AddSearchIndex()
|
||||
system.AddMovieDetailIndex()
|
||||
}
|
||||
// 采集时长在一定阈值内时执行redis数据同步 (存在则更新, 不存在则新增)
|
||||
|
||||
// 开启图片同步
|
||||
if s.SyncPictures {
|
||||
system.SyncFilmPicture()
|
||||
}
|
||||
// 每次成功执行完都清理redis中的相关API接口数据缓存
|
||||
ClearCache()
|
||||
} else if s.Grade == system.SlaveCollect {
|
||||
// 如果采集时长为负, (全量采集), 则在数据采集完成后为search表添加索引
|
||||
if h < 0 {
|
||||
// 全量采集时进行数据同步
|
||||
system.SyncMovieDetail(s.Id, s.Grade)
|
||||
system.AddSlaveMovieInfoIndex()
|
||||
}
|
||||
}
|
||||
|
||||
case system.CollectArticle, system.CollectActor, system.CollectRole, system.CollectWebSite:
|
||||
@@ -149,10 +305,23 @@ func collectFilm(s *system.FilmSource, h, pg int) {
|
||||
// 通过采集站 Grade 类型, 执行不同的存储逻辑
|
||||
switch s.Grade {
|
||||
case system.MasterCollect:
|
||||
// 主站点 保存完整影片详情信息到 redis
|
||||
if err = system.SaveDetails(list); err != nil {
|
||||
// 将数据缓存到redis中
|
||||
if err = system.MovieDetailCache(list); err != nil {
|
||||
log.Println("SaveDetails Error: ", err)
|
||||
}
|
||||
//break
|
||||
// 如果 采集时长 h 小于阈值, 则将主体数据缓存到redis
|
||||
//if h > 0 && h < config.FilmSaveCacheThreshold {
|
||||
// // 主站点 执行保存或更新
|
||||
// if err = system.BatchUpdateDetails(list); err != nil {
|
||||
// log.Println("SaveDetails Error: ", err)
|
||||
// }
|
||||
//} else {
|
||||
// // 主站点 从零开始只执行保存逻辑
|
||||
// if err = system.MovieDetailCache(list); err != nil {
|
||||
// log.Println("SaveDetails Error: ", err)
|
||||
// }
|
||||
//}
|
||||
// 如果主站点开启了图片同步, 则将图片url以及对应的mid存入ZSet集合中
|
||||
if s.SyncPictures {
|
||||
if err = system.SaveVirtualPic(conver.ConvertVirtualPicture(list)); err != nil {
|
||||
@@ -160,10 +329,22 @@ func collectFilm(s *system.FilmSource, h, pg int) {
|
||||
}
|
||||
}
|
||||
case system.SlaveCollect:
|
||||
// 附属站点 仅保存影片播放信息到redis
|
||||
if err = system.SaveSitePlayList(s.Id, list); err != nil {
|
||||
// 将采集数据缓存到redis中
|
||||
if err = system.SlaveDetailCache(s.Id, list); err != nil {
|
||||
log.Println("SaveDetails Error: ", err)
|
||||
}
|
||||
//if h > 0 && h < config.FilmSaveCacheThreshold {
|
||||
// // 附属站点 仅保存影片播放信息到mysql
|
||||
// if err = system.UpdateSitePlayList(s.Id, list); err != nil {
|
||||
// log.Println("SaveDetails Error: ", err)
|
||||
// }
|
||||
//} else {
|
||||
// // 附属站点 仅保存影片播放信息到mysql
|
||||
// if err = system.SlaveDetailCache(s.Id, list); err != nil {
|
||||
// log.Println("SaveDetails Error: ", err)
|
||||
// }
|
||||
//}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,7 +377,42 @@ func collectFilmById(ids string, s *system.FilmSource) {
|
||||
}
|
||||
case system.SlaveCollect:
|
||||
// 附属站点 仅保存影片播放信息到redis
|
||||
if err = system.SaveSitePlayList(s.Id, list); err != nil {
|
||||
if err = system.UpdateSitePlayList(s.Id, list); err != nil {
|
||||
log.Println("SaveDetails Error: ", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 影片信息采集, 改进版
|
||||
func collectFilmRefine(s *system.FilmSource, r util.RequestInfo) {
|
||||
// 执行采集方法 获取影片详情list
|
||||
//log.Printf("%s?%s", r.Uri, r.Params.Encode())
|
||||
list, err := spiderCore.GetFilmDetail(r)
|
||||
if err != nil || len(list) <= 0 {
|
||||
// 添加采集失败记录
|
||||
pg, _ := strconv.Atoi(r.Params.Get("pg"))
|
||||
h, _ := strconv.Atoi(r.Params.Get("h"))
|
||||
fr := system.FailureRecord{OriginId: s.Id, OriginName: s.Name, Uri: s.Uri, CollectType: system.CollectVideo, PageNumber: pg, Hour: h, Cause: fmt.Sprintln(err), Status: 1}
|
||||
system.SaveFailureRecord(fr)
|
||||
log.Println("GetMovieDetail Error: ", err)
|
||||
return
|
||||
}
|
||||
// 通过采集站 Grade 类型, 执行不同的存储逻辑
|
||||
switch s.Grade {
|
||||
case system.MasterCollect:
|
||||
// 将数据缓存到redis中
|
||||
if err = system.MovieDetailCache(list); err != nil {
|
||||
log.Println("SaveDetails Error: ", err)
|
||||
}
|
||||
// 如果主站点开启了图片同步, 则将图片url以及对应的mid存入ZSet集合中
|
||||
if s.SyncPictures {
|
||||
if err = system.SaveVirtualPic(conver.ConvertVirtualPicture(list)); err != nil {
|
||||
log.Println("SaveVirtualPic Error: ", err)
|
||||
}
|
||||
}
|
||||
case system.SlaveCollect:
|
||||
// 将采集数据缓存到redis中
|
||||
if err = system.SlaveDetailCache(s.Id, list); err != nil {
|
||||
log.Println("SaveDetails Error: ", err)
|
||||
}
|
||||
}
|
||||
@@ -235,6 +451,46 @@ func ConcurrentPageSpider(capacity int, s *system.FilmSource, h int, collectFunc
|
||||
}
|
||||
}
|
||||
|
||||
// collectFilmMT 并发采集影片信息
|
||||
func collectFilmMT(capacity int, s *system.FilmSource, r util.RequestInfo, collectFunc func(s *system.FilmSource, r util.RequestInfo)) {
|
||||
// 初始化 channel, 容量为 capacity
|
||||
ch := make(chan int, capacity)
|
||||
|
||||
// 收集结束标识
|
||||
waitCh := make(chan int)
|
||||
// 循环将所有需采集的页码写入 ch
|
||||
for i := 1; i <= capacity; i++ {
|
||||
ch <- i
|
||||
}
|
||||
close(ch)
|
||||
// 开启 MAXGoroutine 数量的协程, 如果分页页数小于设定的最大线程数, 则将线程数设置为1
|
||||
var GoroutineNum = config.MAXGoroutine
|
||||
if capacity < GoroutineNum*5 {
|
||||
GoroutineNum = 1
|
||||
}
|
||||
// 如果满足开启并发的条件, 则开启GoroutineNum数量的协程进行并发采集
|
||||
for i := 0; i < GoroutineNum; i++ {
|
||||
go func() {
|
||||
defer func() { waitCh <- 0 }()
|
||||
for {
|
||||
// 从channel中获取 pageNumber
|
||||
pg, ok := <-ch
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
// 执行对应的采集方法, 并发时不同使用同一个requestInfo
|
||||
requestInfo := util.CopyRequestInfo(r)
|
||||
requestInfo.Params.Set("pg", fmt.Sprint(pg))
|
||||
collectFunc(s, requestInfo)
|
||||
}
|
||||
}()
|
||||
}
|
||||
// 等待所有协程执行完毕
|
||||
for i := 0; i < GoroutineNum; i++ {
|
||||
<-waitCh
|
||||
}
|
||||
}
|
||||
|
||||
// BatchCollect 批量采集, 采集指定的所有站点最近x小时内更新的数据
|
||||
func BatchCollect(h int, ids ...string) {
|
||||
for _, id := range ids {
|
||||
@@ -242,7 +498,7 @@ func BatchCollect(h int, ids ...string) {
|
||||
if fs := system.FindCollectSourceById(id); fs != nil && fs.State {
|
||||
// 采用协程并发执行, 每个站点单独开启一个协程执行
|
||||
go func() {
|
||||
err := HandleCollect(fs.Id, h)
|
||||
err := HandleCollectRefine(fs.Id, h)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
@@ -261,7 +517,7 @@ func AutoCollect(h int) {
|
||||
for _, s := range system.GetCollectSourceList() {
|
||||
// 如果当前站点为启用状态 则执行 HandleCollect 进行数据采集
|
||||
if s.State {
|
||||
if err := HandleCollect(s.Id, h); err != nil {
|
||||
if err := HandleCollectRefine(s.Id, h); err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user