import urllib
from bs4 import BeautifulSoup
import requests
import readability
import time
import http.client
seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit=5
#file = open("file_crawled.txt", "w")
def get_urls(seed_url):
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links
visited = []

def crawl_dfs(seed_url, max_depth):
    depth = 1
    file1 = open("file_crawled.txt", "w+")
    visited.append(root_url)
    if depth <= max_depth:
        children = get_urls(seed_url)
        for child in children:
            if child not in visited:
                file1.write(child)
                time.sleep(1)
                visited.append(child)
                crawl_dfs(child, max_depth-1)
    file1.close()

crawl_dfs(seed_url, max_limit)
This is a DFS crawler written in Python 3.6. Please help me correct my mistakes: the links I crawl are never written to the output file (file1). I don't know why; I have tried everything.
Best answer
You need to open and close the file only once: open it before the first call to crawl_dfs() and close it after that call returns. In your version, every recursive call re-opens file_crawled.txt in "w+" mode, which truncates it, so the writes from earlier calls are lost.
Test:
import urllib
from bs4 import BeautifulSoup
import requests
#import readability
import time
import http.client
# --- functions ---
def get_urls(seed_url):
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links
def crawl_dfs(seed_url, max_depth, file_out):
    if max_depth >= 1:
        children = get_urls(seed_url)
        for child in children:
            if child not in visited:
                file_out.write(child + "\n")
                #time.sleep(1)
                visited.append(child)
                crawl_dfs(child, max_depth-1, file_out)
# --- main ---
seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit = 1
visited = [root_url]
file1 = open("file_crawled.txt", "w+")
crawl_dfs(seed_url, max_limit, file1)
file1.close()
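As a side note on the design, the single open/close can also be written with a context manager, so the file is closed even if a request fails mid-crawl. A minimal sketch, reusing the get_urls() and crawl_dfs() defined above:

# --- alternative main: 'with' closes the file automatically ---
visited = [root_url]
with open("file_crawled.txt", "w") as file1:
    crawl_dfs(seed_url, max_limit, file1)
# file1 is closed here even if crawl_dfs() raised an exception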
On "python - Why aren't my links being written to my file?", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/39843798/