python - Why aren't my links being written to my file

Tags: python python-3.x web web-crawler depth-first-search

import urllib
from bs4 import BeautifulSoup
import requests
import readability
import time
import http.client

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit=5
#file = open("file_crawled.txt", "w")

def get_urls(seed_url):
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content, "html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links


visited=[]
def crawl_dfs(seed_url, max_depth):
    depth = 1
    file1 = open("file_crawled.txt", "w+")
    visited.append(root_url)
    if depth <= max_depth:
        children = get_urls(seed_url)
        for child in children:
            if child not in visited:
                file1.write(child)
                time.sleep(1)
                visited.append(child)
                crawl_dfs(child, max_depth-1)
    file1.close()

crawl_dfs(seed_url,max_limit)

This is a DFS crawl using Python 3.6. Please help me with the code and correct my mistake: the crawled links are never written to the file named file1.txt. I don't know why; I have tried everything.

Best Answer

You only need to open and close the file once: open it before the first call to crawl_dfs() and close it after that call returns. In your code, every recursive call reopens file_crawled.txt in write mode, which truncates the file, so each level of the recursion wipes out whatever the previous levels already wrote.
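
To see why that matters, here is a minimal sketch (demo.txt is just an illustrative filename): every open(..., "w") truncates the file, so reopening it inside the recursion erases earlier output.

# Minimal sketch: opening in "w" mode truncates the file each time.
with open("demo.txt", "w") as f:
    f.write("first line\n")

with open("demo.txt", "w") as f:   # the file is truncated again here
    f.write("second line\n")

with open("demo.txt") as f:
    print(f.read())                # prints only "second line"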

Test:

import urllib
from bs4 import BeautifulSoup
import requests
#import readability
import time
import http.client

# --- functions ---

def get_urls(seed_url):
    r = requests.get(seed_url)
    soup = BeautifulSoup(r.content,"html.parser")
    links = soup.findAll('a', href=True)
    valid_links = []
    for links in links:
        if 'wiki' in links['href'] and '.' not in links['href'] and ':' not in links['href'] and '#' not in links['href']:
            valid_links.append(root_url + links['href'])
    return valid_links


def crawl_dfs(seed_url, max_depth, file_out):
    if max_depth >= 1:
       children = get_urls(seed_url)
       for child in children:
           if child not in visited:          
               file_out.write(child + "\n")                                    
               #time.sleep(1)
               visited.append(child)
               crawl_dfs(child, max_depth-1, file_out)

# --- main ---

seed_url = "https://en.wikipedia.org/wiki/Sustainable_energy"
root_url = "https://en.wikipedia.org"
max_limit = 1

visited=[root_url]

file1 = open("file_crawled.txt", "w+")

crawl_dfs(seed_url, max_limit, file1)

file1.close()
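
As a side note (a sketch, not part of the original answer): a with statement can replace the explicit open()/close() pair, so the file is closed even if the crawl raises an exception partway through.

# Same idea, but letting a context manager close the file automatically.
visited = [root_url]

with open("file_crawled.txt", "w") as file1:
    crawl_dfs(seed_url, max_limit, file1)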

Regarding "python - Why aren't my links being written to my file", a similar question was found on Stack Overflow: https://stackoverflow.com/questions/39843798/
