diff --git a/kirinjewelry2006视频spider.py b/kirinjewelry2006视频spider.py
new file mode 100644
index 0000000..88ace2f
--- /dev/null
+++ b/kirinjewelry2006视频spider.py
@@ -0,0 +1,54 @@
+import csv
+import os
+import time
+
+import parsel
+import requests
+from tqdm import tqdm
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
+}
+
+
+def get_video(page_url):
+    """Download the video on a product detail page and log its name."""
+    # Example: page_url = "https://www.kirinjewelry2006.com/video/products-detail-967094"
+    response = requests.get(url=page_url, headers=headers)
+    if response.status_code != 200:
+        print("Failed to fetch the video page")
+        return
+    select = parsel.Selector(response.text)
+    # The page title is "<category>-<name>"; keep only the name part.
+    video_name = select.xpath('//*[@id="v620a34694dad7"]//h1/text()').get().split("-")[-1]
+    video_url = select.xpath('//*[@id="v620a34694dad7"]//img/@video-src').get()
+    video_res = requests.get(url=video_url, headers=headers).content
+
+    os.makedirs("video", exist_ok=True)
+    with open("video/{}.mp4".format(video_name), "wb") as file:
+        file.write(video_res)
+    print("Saved video: {}".format(video_name))
+
+    # Record the downloaded video name so finished items can be tracked.
+    with open("video.csv", encoding="utf-8", mode="a", newline="") as f:
+        csv_writer = csv.writer(f)
+        csv_writer.writerow([video_name])
+    time.sleep(5)
+
+
+for page in tqdm(range(2, 87)):
+    print("Crawling list page {}".format(page))
+    url = "https://www.kirinjewelry2006.com/products-list-{}".format(page)
+    response = requests.get(url=url, headers=headers).text
+    select = parsel.Selector(response)
+
+    lis = select.xpath('//*[@id="v6204665765bf2"]/div/div[2]/div[2]//div[1]/ul/li')
+    for li in lis:
+        href = li.xpath('div/a/@href').get()
+        page_url = "https://www.kirinjewelry2006.com" + href
+        try:
+            get_video(page_url)
+        except Exception as e:
+            # Skip detail pages whose layout does not match the XPaths above.
+            print("Skipping {}: {}".format(page_url, e))
+            continue
diff --git a/product-sku.py b/product-sku.py
new file mode 100644
index 0000000..2cd605b
--- /dev/null
+++ b/product-sku.py
@@ -0,0 +1,16 @@
+import csv
+
+# Extract the SKU (last space-separated field, quotes stripped) from each
+# line of product_SKU.csv.
+sku_list = []
+with open("product_SKU.csv", "r", encoding="utf-8") as file:
+    for line in file:
+        sku_strlist = line.split(" ")
+        skustr = sku_strlist[-1].strip().replace("\"", "")
+        sku_list.append(skustr)
+
+# Append the extracted SKUs to sku_product.csv, one per row.
+with open("sku_product.csv", mode="a", encoding="utf-8", newline="") as f:
+    csv_writer = csv.writer(f)
+    for sku in sku_list:
+        csv_writer.writerow([sku])
diff --git a/潜在注册链接_http-molbiol-ru.py b/潜在注册链接_http-molbiol-ru.py
new file mode 100644
index 0000000..3143bd3
--- /dev/null
+++ b/潜在注册链接_http-molbiol-ru.py
@@ -0,0 +1,24 @@
+import csv
+import time
+
+import parsel
+import requests
+from tqdm import tqdm
+
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
+    # "Cookie": "_bl_uid=g9ld3se62R4uz3xICmC3jksen8CO; member_id=1295895; pass_hash=60c367c69b5cd4b8ef6780634d5dd292; member_name=solaryusa; mqtids=%2C; session_id=4c04b006bf42205767eae453eb09c036; topicsread=a%3A1%3A%7Bi%3A358346%3Bi%3A1709013153%3B%7D"
+}
+
+# Walk the forum topic 50 posts at a time and collect candidate registration
+# links, appending them to molbiol-ru潜在注册链接.csv.
+for page in tqdm(range(0, 5652, 50)):
+    time.sleep(5)
+    url = "http://molbiol.ru/forums/index.php?showtopic=358346&st={}".format(page)
+    response = requests.get(url=url, headers=headers).text
+    select = parsel.Selector(response)
+    links = select.xpath('//*[@id="ipbwrapper"]/div[2]//noindex/a/@href').getall()
+    with open("molbiol-ru潜在注册链接.csv", mode="a", encoding="utf-8", newline="") as f:
+        csv_writer = csv.writer(f)
+        for link in links:
+            csv_writer.writerow([link])