今天我们来谈谈在Python爬虫及实际应用方面的一些小技巧。接下来,我将分享如何使用Python爬取wooyun镜像站的文章。
首先,我们需要编写两个程序:一个用于抓取图片并将其保存到本地,另一个用于抓取页面源代码。下面是这两个程序的代码:
1. 抓取图片并保存到本地的程序(wooyun_spider):
```python
#coding: utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
def dopost(url):
    """Fetch *url* with an HTTP GET and return the raw response body.

    Despite the name, no POST is involved: the helper issues a plain GET
    with a 6-second timeout.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None``.

    Network failures (including the timeout) are not handled here; they
    surface as exceptions from ``requests`` and must be handled by the caller.
    """
    r = requests.get(url, timeout=6)
    if r.status_code == 200:
        return r.content
    return None
def bsparser(content):
    """Return every ``<a>`` tag in *content* that links to a wooyun article.

    Args:
        content: Raw HTML bytes of a search-result page.

    Returns:
        The bs4 result list of ``<a>`` tags whose ``href`` contains
        ``/wooyun-<digits>-<digits>.html``.
    """
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    # The dot before "html" is escaped so it only matches a literal '.'.
    return soup.find_all('a', href=re.compile(r'/wooyun-\d+-\d+\.html'))
dic = {}  # article title -> article href, filled from the search-result pages

if __name__ == '__main__':
    # Make sure the output directory exists before crawling.
    if os.path.exists(r'd:\wooyun'):
        print('d://wooyun dir exists...spider begin')
    else:
        os.mkdir(r'd:\wooyun')
        print('mkdir d://wooyun...spider begin')

    # Walk the first four search-result pages.
    for i in range(1, 5):
        url = ('http://wy.hxsec.com/search?keywords=&&content_search_by=by_bugs'
               '&&search_by_html=False&&page=%d' % i)
        try:
            content = dopost(url)
        except requests.RequestException:
            # One unreachable page must not abort the remaining pages.
            print(url + ' err')
            continue
        if not content:
            continue
        for bq in bsparser(content):
            dic[bq.get_text()] = bq['href']
            # TODO: save the linked resource locally here; the original note
            # suggested something like save_image(dic[bq.get_text()]), but no
            # such helper is defined in this script.
```
2. 抓取页面源代码的程序(save_page):
```python
#coding: utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
def dopost(url):
    """Download *url* via HTTP GET (6-second timeout).

    The name is historical; the request is a GET, not a POST.

    Returns:
        ``bytes`` with the response body for a 200 response, ``None`` for
        any other status code.

    Exceptions raised by ``requests`` (connection errors, timeouts) are
    propagated to the caller.
    """
    response = requests.get(url, timeout=6)
    if response.status_code != 200:
        return None
    return response.content
def bsparser(content):
    """Collect the article links found in a page.

    Args:
        content: Raw HTML bytes, decoded by BeautifulSoup as UTF-8.

    Returns:
        The bs4 result list of ``<a>`` tags whose ``href`` matches
        ``/wooyun-<digits>-<digits>.html``.
    """
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    bqs = soup.find_all('a', href=re.compile(r'/wooyun-\d+\-\d+\.html'))
    return bqs
dic = {}  # article title -> absolute article URL

if __name__ == '__main__':
    from urllib.parse import urljoin

    wooyun_dir = r'd:\wooyun'
    os.makedirs(wooyun_dir, exist_ok=True)  # no error if it already exists

    # Start page to crawl; replace it with the page you want to mirror.
    url = 'http://www.wooyun.org/'
    # dopost() issues a plain GET; no POST data or session is needed here.
    content = dopost(url)
    if content:
        # Without this step dic stays empty and nothing below would run.
        for bq in bsparser(content):
            # urljoin leaves absolute hrefs untouched and resolves relative ones.
            dic[bq.get_text()] = urljoin(url, bq['href'])

    # Save every collected article as "<title>.html".
    for k, v in dic.items():
        # Drop the characters Windows does not allow in file names.
        file_name = re.sub(r'[\\/:*?"<>|]', '', k) + '.html'
        file_str = os.path.join(wooyun_dir, file_name)
        try:
            con = dopost(v)
        except requests.RequestException:
            con = None
        if con is None:
            print(k + ' is error')
            continue
        try:
            # The body is bytes, so the file must be opened in binary mode.
            with open(file_str, 'wb') as f:
                f.write(con)
        except OSError:
            print(k + ' is error')
            continue
        print(k + ' is ok')
    dic.clear()
# Download the images and save them locally.
def bsparser_img(content):
    """Return the ``<img>`` tags in *content* that point at article images.

    Args:
        content: Raw HTML bytes of an article page.

    Returns:
        The bs4 result list of ``<img>`` tags whose ``src`` contains
        ``full/<name>.jpg``.
    """
    # NOTE(review): the 'lxml' parser presumably requires the lxml package to
    # be installed; the other helpers in this article use 'html.parser'.
    soup = BeautifulSoup(content, 'lxml', from_encoding='utf-8')
    imgs = soup.find_all('img', src=re.compile(r'full/\w+\.jpg'))
    return imgs


dic = {}
```
接下来,我们对原始代码进行重构,将其分解为几个方法。我们创建一个名为`run`的方法,用于执行主要逻辑;其他辅助方法,如`check_directory`、`dopost`、`bsparser`和`bsparser_img`,则根据其功能进行命名。最后,我们把这些方法组织在一个类(`WooyunSpider`)中,以便更好地管理它们。
```python
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class WooyunSpider:
    """Download the images referenced by wooyun mirror articles.

    The spider walks the first four search-result pages, collects the
    article links on each page, and stores every matching image under
    ``<base_dir>/full``.
    """

    # Search-result URL without the trailing page number.
    SEARCH_URL = ('http://wy.hxsec.com/search?keywords=&&content_search_by=by_bugs'
                  '&&search_by_html=False&&page=')
    # Prefix that turns an image ``src`` into a downloadable URL.
    IMG_BASE_URL = 'http://static.hx99.net/static/bugs/'

    def __init__(self, base_dir=r'd:\wooyun'):
        """Create a spider that stores its files under *base_dir*."""
        self.dic = {}  # article title -> absolute article URL
        self.base_dir = base_dir

    @classmethod
    def _page_url(cls, page):
        """Return the search-result URL for the 1-based *page* number."""
        return cls.SEARCH_URL + str(page)

    def check_directory(self):
        """Create ``<base_dir>/full`` (and missing parents) if necessary."""
        os.makedirs(os.path.join(self.base_dir, 'full'), exist_ok=True)
        print('img_spider begin')

    def dopost(self, url):
        """GET *url*; return the body as bytes, or None unless the status is 200.

        Exceptions raised by ``requests`` are propagated to the caller.
        """
        r = requests.get(url, timeout=6)
        if r.status_code == 200:
            return r.content
        return None

    def bsparser(self, content):
        """Return the ``<a>`` tags in *content* that link to wooyun articles."""
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        return soup.find_all('a', href=re.compile(r'/wooyun-\d+-\d+\.html'))

    def bsparser_img(self, con):
        """Return the ``<img>`` tags in *con* whose src contains ``full/<name>.jpg``."""
        soup = BeautifulSoup(con, 'html.parser', from_encoding='utf-8')
        return soup.find_all('img', src=re.compile(r'full/\w+\.jpg'))

    def _save_images(self, article_url):
        """Download every matching image of one article into ``base_dir``."""
        try:
            con = self.dopost(article_url)
        except requests.RequestException:
            con = None
        if not con:
            print(article_url + ' err')
            return
        for img in self.bsparser_img(con):
            src = img['src']
            try:
                img_con = self.dopost(self.IMG_BASE_URL + src)
                if img_con is None:
                    print(src + ' err')
                    continue
                # A leading '/' would make os.path.join discard base_dir.
                file_str = os.path.join(self.base_dir, src.lstrip('/'))
                os.makedirs(os.path.dirname(file_str), exist_ok=True)
                with open(file_str, 'wb') as f:
                    f.write(img_con)
            except (requests.RequestException, OSError):
                print(src + ' err')

    def run(self):
        """Crawl result pages 1-4 and save the images of every listed article."""
        self.check_directory()
        for i in range(1, 5):
            url = self._page_url(i)
            try:
                content = self.dopost(url)
            except requests.RequestException:
                content = None
            if not content:
                print(url + ' err')
                continue
            for bq in self.bsparser(content):
                # urljoin keeps absolute hrefs and resolves relative ones.
                self.dic[bq.get_text()] = urljoin(url, bq['href'])
            for v in self.dic.values():
                self._save_images(v)
            # Start the next page with an empty link table.
            self.dic.clear()
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = WooyunSpider()
    spider.run()
```
这样,我们就将原始代码重构为了一个类,并将一些辅助函数移到了类中。这使得代码更加结构化和易于维护。
图片1:wooyun_spider2.PNG (213.01 KB, 下载次数: 41)
上传时间:2017-2-2 12:31
打开html文件后,我们发现页面中的图片是失效的(见图片2:wooyun_spider5.PNG (144.89 KB, 下载次数: 41))。由于本地没有这些图片,我们需要运行第二个程序来抓取它们,抓取结果见图片3:wooyun_spider3.PNG (325.23 KB, 下载次数: 49)。现在再打开html文件,图片已经可以正常显示(见图片4:wooyun_spider4.PNG (661.76 KB, 下载次数: 40))。虽然程序只抓取了4页,但理论上可以抓取所有文章,只是这个过程可能会比较慢。
二、Python监听&模拟鼠标
以下是使用winapi编程和pyhook实现的源码。运行此程序后,鼠标滚轮滚动将产生类似于鼠标左右键的效果(即滚轮向上滚动相当于按下鼠标左键再松开,向下滚动相当于按下鼠标右键再松开)。当然,你可以根据需要进行修改。例如,如果你想在玩游戏时让鼠标不停地按右键,可以使用此程序将滚轮映射为鼠标右键,实现作弊功能。
```python
# -*- coding: utf-8 -*-
import pythoncom
import pyHook
import time
import win32api
import win32con
def onMouseWheel(event):
    """Turn a mouse-wheel step into a simulated mouse click.

    ``event.Wheel == 1`` (described in the article as scrolling up) produces
    a left click; ``event.Wheel == -1`` produces a right click. Any other
    value is ignored.

    Returns:
        Always ``True``.
        NOTE(review): presumably this tells pyHook to pass the event on to
        other handlers - confirm against the pyHook documentation.
    """
    if event.Wheel == 1:
        press, release = win32con.MOUSEEVENTF_LEFTDOWN, win32con.MOUSEEVENTF_LEFTUP
    elif event.Wheel == -1:
        press, release = win32con.MOUSEEVENTF_RIGHTDOWN, win32con.MOUSEEVENTF_RIGHTUP
    else:
        return True

    win32api.mouse_event(press, 0, 0)
    # Short pause between button-down and button-up.
    time.sleep(0.05)
    win32api.mouse_event(release, 0, 0)
    return True
```
三、让QQ窗口抖一抖
```python
# 1. 使用pyHook让你的qq窗口抖一抖
import pyHook
import pythoncom
import ctypes
import random
import win32con
import win32gui
class xyz(ctypes.Structure):
    """Rectangle structure filled in by ``user32.GetWindowRect``.

    Holds the four edges of a window as C ints, in the order
    left, top, right, bottom.
    """

    _fields_ = [
        ('left', ctypes.c_int),    # x coordinate of the left edge
        ('top', ctypes.c_int),     # y coordinate of the top edge
        ('right', ctypes.c_int),   # x coordinate of the right edge
        ('bottom', ctypes.c_int),  # y coordinate of the bottom edge
    ]
user32 = ctypes.windll.user32
# Look the target window up by its class name.
# NOTE(review): "TXGuiFoundation" is presumably the QQ main-window class - confirm.
HWND = win32gui.FindWindow("TXGuiFoundation", None)

# Remember where the window currently is so it can be put back afterwards.
xyzt = xyz()
user32.GetWindowRect(HWND, ctypes.byref(xyzt))
width = xyzt.right - xyzt.left
height = xyzt.bottom - xyzt.top

# Shake: move the window by a random offset whose upper bound grows each step.
for i in range(2, 200):
    user32.SetWindowPos(HWND, None,
                        xyzt.left + random.randint(1, i),
                        xyzt.top - random.randint(1, i),
                        width, height, win32con.SWP_SHOWWINDOW)

# Restore the original position and size.
user32.SetWindowPos(HWND, None, xyzt.left, xyzt.top, width, height,
                    win32con.SWP_SHOWWINDOW)
# 2. 让QQ窗口抖一抖的恶作剧代码
#coding: utf-8
import ctypes
import random
import win32con
import win32gui
# NOTE(review): this snippet is not runnable as shown. Its indentation was
# lost, on_mouse_wheel has no body, and hook(), unhook() and on_mouse_all()
# are referenced by __init__ but only exist as commented-out sketches below.
# Intended purpose (from the code): a pyHook.HookManager subclass that
# installs mouse hooks and pumps messages from its constructor.
class HookManager(pyHook.HookManager):
def __init__(self):
pyHook.HookManager.__init__(self)
# Callback that is supposed to receive mouse-wheel messages.
self.MouseWheelCallBack = self.on_mouse_wheel
# Tracks whether the hooks are currently installed.
self.hooked = False
# NOTE(review): pyHook.HookCallback and self.on_mouse_all are not defined
# anywhere in this file - verify they exist before relying on this line.
self.MouseAll = pyHook.HookCallback(self.on_mouse_all)
self.hook()
# NOTE(review): the return value of HookMouse() is used as an object with
# UnhookMouse()/UnhookAll() methods - confirm against the pyHook API.
hm = self.HookMouse()
# Blocks here processing window messages; the lines below only run after
# the message pump returns.
pythoncom.PumpMessages()
hm.UnhookMouse()
hm.UnhookAll()
self.unhook()
def on_mouse_wheel(self, nCode, wParam, lParam):
# Mouse-wheel hook callback. NOTE(review): the method has no body.
#   nCode  - the hook code passed to the called function
#   wParam - pointer to a WPARAM structure or one of its variants
#   lParam - pointer to a LPARAM structure or one of its variants
# Sketch from the original comment:
#   if nCode >= 0 and wParam == mouseWheel: return True
#
# Commented-out sketch of on_mouse_all(self, event), the hook callback for
# all mouse messages:
#   if event is not a MouseEvent or not self.hooked: return
#   if event == MouseAll: return False
#   if event == MOUSEWHEEL: self.MouseWheelCallBack(nCode, wParam, lParam)
#   return True
#
# Commented-out sketch of hook(), the method that sets up the hooks:
#   self.hooked = True
#   return self
#
# Commented-out sketch of unhook(), the method that removes the hooks:
#   self.hooked = False
#   return self
#
# Commented-out sketch of the entry point:
#   if __name__ == '__main__':
#       hook()
#       pythoncom.PumpMessages()
四、总结
我将模拟鼠标和抖动QQ主窗口的功能打包成了一个exe文件,只需双击即可运行。需要注意的是,模拟鼠标的exe文件与源代码略有不同,它仅模拟了鼠标左键操作,而源代码则同时模拟了左右键操作。如果你对此感兴趣,可以自行下载源代码并将其打包成exe文件进行尝试。
链接:http://pan.baidu.com/s/1boTZPnx
祝大家新年快乐!现在感觉有些困了,去睡一觉吧。