今天我们来谈谈在Python爬虫及实际应用方面的一些小技巧。接下来,我将分享如何使用Python爬取wooyun镜像站的文章。

首先,我们需要编写两个程序:一个用于抓取图片并将其保存到本地,另一个用于抓取页面源代码。下面是这两个程序的代码:

1. 抓取图片并保存到本地的程序(wooyun_spider):

```python

#coding: utf-8

import requests

from bs4 import BeautifulSoup

import re

import os

def dopost(url):
    """Download *url* and return the raw response body.

    Despite its name, this helper issues an HTTP GET, not a POST.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``bytes`` when the server answers with
        status 200, otherwise ``None``.

    Raises:
        requests.RequestException: If the connection fails or the
            six-second timeout expires.
    """
    r = requests.get(url, timeout=6)
    if r.status_code == 200:
        return r.content
    return None

def bsparser(content):
    """Return every ``<a>`` tag in *content* that links to a wooyun article.

    Args:
        content: Raw HTML of a search-result page, as returned by ``dopost``.

    Returns:
        The matching ``<a>`` tags; a tag matches when its ``href`` contains
        ``/wooyun-<digits>-<digits>.html``.
    """
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    # The dot before "html" is escaped so it only matches a literal ".",
    # not an arbitrary character.
    bqs = soup.find_all('a', href=re.compile(r'/wooyun-\d+-\d+\.html'))
    return bqs

# Maps article title -> article link for every article found by the crawl.
dic = {}

if __name__ == '__main__':
    # Make sure the output directory exists before crawling.
    if os.path.exists(r'd:\wooyun'):
        print('d://wooyun dir exists...spider begin')
    else:
        os.mkdir(r'd:\wooyun')
        print('mkdir d://wooyun...spider begin')

    # Walk result pages 1-4 of the mirror's search listing.
    for i in range(1, 5):
        url = ('http://wy.hxsec.com/search?keywords=&&content_search_by=by_bugs'
               '&&search_by_html=False&&page=' + str(i))
        try:
            content = dopost(url)
        except requests.RequestException:
            # One unreachable page should not abort the whole crawl.
            print(url + ' is error')
            continue
        if not content:
            # dopost returns None for any non-200 answer.
            continue
        for bq in bsparser(content):
            dic[bq.get_text()] = bq['href']
            # Placeholder: save the images of this article here,
            # e.g. save_image(dic[bq.get_text()])

```

2. 抓取页面源代码的程序(save_page):

```python

#coding: utf-8

import requests

from bs4 import BeautifulSoup

import re

import os

def dopost(url):
    """Fetch *url* with an HTTP GET and hand back the response body.

    The name is historical: no POST request is made.

    Args:
        url: Address of the page to download.

    Returns:
        The body as ``bytes`` if the status code is 200, else ``None``.

    Raises:
        requests.RequestException: On connection errors or when the
            six-second timeout is exceeded.
    """
    r = requests.get(url, timeout=6)
    if r.status_code == 200:
        return r.content
    return None

def bsparser(content):
    """Collect the article links contained in an HTML page.

    Args:
        content: Raw HTML bytes, decoded by BeautifulSoup as UTF-8.

    Returns:
        All ``<a>`` tags whose ``href`` contains
        ``/wooyun-<digits>-<digits>.html``.
    """
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    bqs = soup.find_all('a', href=re.compile(r'/wooyun-\d+\-\d+\.html'))
    return bqs

# Maps article title -> absolute article URL.
dic = {}

if __name__ == '__main__':
    from urllib.parse import urljoin

    wooyun_dir = r'd:\wooyun'
    # exist_ok=True turns this into a no-op when the directory already exists.
    os.makedirs(wooyun_dir, exist_ok=True)

    # Index page whose article links should be saved; adjust as needed.
    url = 'http://www.wooyun.org/'
    # dopost performs a plain GET (no POST data is involved) and returns
    # None unless the server answered with status 200.
    content = dopost(url)
    if content:
        for bq in bsparser(content):
            # Resolve relative hrefs against the index page so that the
            # download below always receives an absolute URL.
            dic[bq.get_text()] = urljoin(url, bq['href'])

    # Save every collected article as "<title>.html".
    for (k, v) in dic.items():
        try:
            con = dopost(v)
        except requests.RequestException:
            con = None
        if con is None:
            print(k + ' is error')
            continue

        # Drop the characters Windows forbids in file names (the original
        # expression only ever removed "/").
        file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '', k).strip() + '.html'
        try:
            # The body is bytes, so the file must be opened in binary mode.
            with open(os.path.join(wooyun_dir, file_name), 'wb') as f:
                f.write(con)
        except OSError:
            print(k + ' is error')
        else:
            print(k + ' is ok')

    dic.clear()

# 下载图片并保存到本地

def bsparser_img(content):
    """Return the ``<img>`` tags of an article that point at stored images.

    Args:
        content: Raw HTML bytes of an article page.

    Returns:
        All ``<img>`` tags whose ``src`` contains ``full/<name>.jpg``.
    """
    # Use the built-in 'html.parser' like the other helpers so that no
    # additional parser package (lxml) has to be installed.
    soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
    imgs = soup.find_all('img', src=re.compile(r'full/\w+\.jpg'))
    return imgs

# Rebind dic to a fresh, empty dict.
dic = {}

```

接下来,我们对原始代码进行重构,把它拆分成几个职责单一的方法,并组织在一个类中以便更好地管理:`run`方法负责执行主要逻辑,`check_directory`、`dopost`、`bsparser`和`bsparser_img`等辅助方法则根据各自的功能命名。

```python

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class WooyunSpider:
    """Download the images referenced by articles of the wooyun mirror.

    ``run`` walks the first four pages of the mirror's search listing,
    opens every article linked from them and stores each matching image
    below ``d:\\wooyun``.
    """

    # Search listing URL without the page number; see _page_url.
    SEARCH_URL = ('http://wy.hxsec.com/search?keywords=&&content_search_by=by_bugs'
                  '&&search_by_html=False&&page=')
    # Prefix that the image paths found in an article are appended to.
    IMG_BASE_URL = 'http://static.hx99.net/static/bugs/'

    def __init__(self):
        # Maps article title -> article URL for the listing page in progress.
        self.dic = {}

    def check_directory(self):
        """Create the image output directory when it is missing."""
        # makedirs also creates d:\wooyun itself and does not fail when the
        # directory is already present.
        os.makedirs(r'd:\wooyun\full', exist_ok=True)
        print('img_spider begin')

    def dopost(self, url):
        """GET *url*; return the body as bytes, or None unless status is 200.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        r = requests.get(url, timeout=6)
        if r.status_code == 200:
            return r.content
        return None

    def bsparser(self, content):
        """Return the ``<a>`` tags in *content* that link to an article."""
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        return soup.find_all('a', href=re.compile(r'/wooyun-\d+-\d+\.html'))

    def bsparser_img(self, con):
        """Return the ``<img>`` tags in *con* whose src is ``full/<name>.jpg``."""
        soup = BeautifulSoup(con, 'html.parser', from_encoding='utf-8')
        return soup.find_all('img', src=re.compile(r'full/\w+\.jpg'))

    def _page_url(self, page):
        """Return the search listing URL for result page number *page*."""
        return self.SEARCH_URL + str(page)

    def _save_image(self, src):
        """Download one image and store it under ``d://wooyun/`` + *src*."""
        try:
            img_con = self.dopost(self.IMG_BASE_URL + src)
        except requests.RequestException:
            img_con = None
        if img_con is None:
            print(src + ' err')
            return
        try:
            with open('d://wooyun/' + src, 'wb') as f:
                f.write(img_con)
        except OSError:
            print(src + ' err')

    def run(self):
        """Crawl listing pages 1-4 and save the images of every article."""
        self.check_directory()
        for i in range(1, 5):
            url = self._page_url(i)
            try:
                content = self.dopost(url)
            except requests.RequestException:
                print(url + ' err')
                continue
            if not content:
                continue

            for bq in self.bsparser(content):
                # Resolve relative article links against the listing page.
                self.dic[bq.get_text()] = urljoin(url, bq['href'])

            for v in self.dic.values():
                try:
                    con = self.dopost(v)
                except requests.RequestException:
                    con = None
                if not con:
                    print(v + ' err')
                    continue
                for img in self.bsparser_img(con):
                    self._save_image(img['src'])

            # Start the next listing page with an empty mapping.
            self.dic.clear()

if __name__ == '__main__':
    # Crawl only when executed as a script, not when imported.
    spider = WooyunSpider()
    spider.run()

```

这样,我们就将原始代码重构为了一个类,并将一些辅助函数移到了类中。这使得代码更加结构化和易于维护。

程序运行后的效果如下:

图片1:wooyun_spider2.PNG (213.01 KB, 下载次数: 41)

上传时间:2017-2-2 12:31

打开html文件后,我们发现其中的图片是失效的(见图片2:wooyun_spider5.PNG (144.89 KB, 下载次数: 41))。由于本地没有这些图片,我们需要运行第二个程序来抓取它们,抓取结果见图片3:wooyun_spider3.PNG (325.23 KB, 下载次数: 49)。现在再打开html文件,图片就可以正常显示了(见图片4:wooyun_spider4.PNG (661.76 KB, 下载次数: 40))。虽然程序只抓取了4页,但理论上可以抓取所有文章,只是这个过程可能会比较慢。

二、Python监听&模拟鼠标

以下是使用winapi编程和pyhook实现的源码。运行此程序后,鼠标滚轮滚动将产生类似于鼠标左右键的效果(即滚轮向上滚动相当于按下鼠标左键再松开,向下滚动相当于按下鼠标右键再松开)。当然,你可以根据需要进行修改。例如,如果你想在玩游戏时让鼠标不停地按右键,可以使用此程序将滚轮映射为鼠标右键,实现作弊功能。

```python

# -*- coding: utf-8 -*-

import pythoncom

import pyHook

import time

import win32api

import win32con

def onMouseWheel(event):
    """Turn one mouse-wheel notch into a mouse click.

    A wheel value of 1 produces a left click and a value of -1 a right
    click; each click is a button-down, a 50 ms pause and a button-up.
    Any other wheel value produces no click.

    Args:
        event: Mouse event delivered by pyHook; only ``event.Wheel`` is read.

    Returns:
        Always ``True`` (pyHook treats a true return value as "pass the
        event on").
    """
    if event.Wheel == 1:
        down, up = win32con.MOUSEEVENTF_LEFTDOWN, win32con.MOUSEEVENTF_LEFTUP
    elif event.Wheel == -1:
        down, up = win32con.MOUSEEVENTF_RIGHTDOWN, win32con.MOUSEEVENTF_RIGHTUP
    else:
        return True

    win32api.mouse_event(down, 0, 0)
    time.sleep(0.05)
    win32api.mouse_event(up, 0, 0)
    return True


if __name__ == '__main__':
    # Register the handler and pump Windows messages; without this the
    # callback above is never invoked.
    hm = pyHook.HookManager()
    hm.MouseWheel = onMouseWheel
    hm.HookMouse()
    pythoncom.PumpMessages()

```

三、让QQ窗口抖一抖

```python

# 1. 使用pyHook让你的qq窗口抖一抖

import pyHook

import pythoncom

import ctypes

import random

import win32con

import win32gui

class xyz(ctypes.Structure):
    """Window rectangle: four C ints named left, top, right and bottom.

    An instance is passed by reference to ``GetWindowRect`` so that the
    call can fill in the window's coordinates.
    """

    _fields_ = [
        ('left', ctypes.c_int),
        ('top', ctypes.c_int),
        ('right', ctypes.c_int),
        ('bottom', ctypes.c_int),
    ]

user32 = ctypes.windll.user32

# Look the target window up by its window class name.
HWND = win32gui.FindWindow("TXGuiFoundation", None)
xyzt = xyz()

# Only shake when the window was found and its rectangle could be read;
# otherwise xyzt would still hold zeros.
if HWND and user32.GetWindowRect(HWND, ctypes.byref(xyzt)):
    # The window size never changes, so compute it once.
    width = xyzt.right - xyzt.left
    height = xyzt.bottom - xyzt.top

    # Jump to random offsets of growing amplitude around the start position.
    for i in range(2, 200):
        user32.SetWindowPos(HWND, None,
                            xyzt.left + random.randint(1, i),
                            xyzt.top - random.randint(1, i),
                            width, height, win32con.SWP_SHOWWINDOW)

    # Put the window back where it started.
    user32.SetWindowPos(HWND, None, xyzt.left, xyzt.top,
                        width, height, win32con.SWP_SHOWWINDOW)
else:
    print('QQ window not found')

# 2. 让QQ窗口抖一抖的恶作剧代码

#coding: utf-8

import ctypes

import random

import win32con

import win32gui

# Subclass of pyHook.HookManager whose constructor installs a mouse hook and
# pumps messages.
# NOTE(review): as published this class is not runnable -- the indentation is
# lost, on_mouse_wheel has no body, and several names used below are not
# defined anywhere in the visible listing. Confirm the intent before relying
# on it.
# NOTE(review): the import lines directly above this class do not include
# pyHook or pythoncom -- confirm they are in scope.
class HookManager(pyHook.HookManager):

def __init__(self):

pyHook.HookManager.__init__(self)

self.MouseWheelCallBack = self.on_mouse_wheel

self.hooked = False

# NOTE(review): pyHook.HookCallback and self.on_mouse_all are not defined in
# the visible code -- verify that they exist.
self.MouseAll = pyHook.HookCallback(self.on_mouse_all)

# NOTE(review): no hook() method is defined in the visible class.
self.hook()

# NOTE(review): hm is whatever HookMouse() returns; the calls further down
# assume it offers UnhookMouse() and UnhookAll() -- confirm.
hm = self.HookMouse()

# NOTE(review): presumably blocks inside the constructor until the message
# loop ends -- confirm this is intended.
pythoncom.PumpMessages()

hm.UnhookMouse()

hm.UnhookAll()

# NOTE(review): no unhook() method is defined in the visible class.
self.unhook()

def on_mouse_wheel(self, nCode, wParam, lParam):
# NOTE(review): the signature above has no body. Everything that followed it
# in the original was one long comment holding the pseudo-code sketch below,
# reproduced here in readable form. None of it is executable.
#
# on_mouse_wheel(nCode, wParam, lParam) -- mouse wheel message hook callback:
#     nCode  -- the hook code passed to the called function
#     wParam -- a WPARAM value or one of its variants
#     lParam -- an LPARAM value or one of its variants
#     if nCode >= 0 and wParam == mouseWheel: return True
#
# on_mouse_all(event) -- hook callback for all messages:
#     event  -- the type of event
#     if event is not a MouseEvent, or not self.hooked: return
#     if event == MouseAll: return False
#     if event == MOUSEWHEEL: self.MouseWheelCallBack(nCode, wParam, lParam)
#     return True
#
# hook()   -- sets up the hooks:   self.hooked = True;  return self
# unhook() -- removes the hooks:   self.hooked = False; return self
#
# if __name__ == '__main__': hook(); pythoncom.PumpMessages()

```

四、总结

我将模拟鼠标和抖动QQ主窗口的功能打包成了一个exe文件,只需双击即可运行。需要注意的是,模拟鼠标的exe文件与源代码略有不同,它仅模拟了鼠标左键操作,而源代码则同时模拟了左右键操作。如果你对此感兴趣,可以自行下载源代码并将其打包成exe文件进行尝试。

链接:http://pan.baidu.com/s/1boTZPnx

祝大家新年快乐!现在感觉有些困了,去睡一觉吧。