python - How to split one HTML page into multiple HTML files based on keywords in the text

Tags: python html regex beautifulsoup urllib

I want to split a single HTML file into multiple HTML files based on the keyword PART. The given HTML file contains text that mentions four parts: PART I, PART II, PART III and PART IV.

I want to split the HTML into 5 parts (a rough sketch of the intended boundaries follows the list):

  • PART 0 - should contain the text from the beginning of the HTML up to, but not including, PART I
  • PART I - should contain the text from the start of PART I up to, but not including, PART II
  • PART II - should contain the text from the start of PART II up to, but not including, PART III
  • PART III - should contain the text from the start of PART III up to, but not including, PART IV
  • PART IV - should contain the text from the start of PART IV to the end.
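
To illustrate, the boundary logic I am after looks like this (a rough sketch only; the offsets are made up, only the slicing matters):

marker_offsets = [1200, 34000, 98000, 152000]            # hypothetical positions of the PART I..IV headings
boundaries = [0] + marker_offsets + [None]               # PART 0 starts at the very beginning of the document
slices = [slice(boundaries[i], boundaries[i + 1]) for i in range(5)]
# slices[0] -> text before PART I, slices[1] -> PART I up to PART II, ..., slices[4] -> PART IV to the end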

Here are some sample HTML files:

https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm
https://www.sec.gov/Archives/edgar/data/820027/000082002719000010/amp12312018.htm

Please refer to my code below:

import sys
import re
from bs4 import BeautifulSoup
import os
import numpy as np
from urllib.request import urlopen
import pandas as pd
list_values_page_number=[]


type_parts = ['PART 0','PART I','PART II','PART III','PART IV']
output_path = r"D:\Tasks\10K\SEGMENTATION\2_segmentation"
input_files = ['https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm',
              'https://www.sec.gov/Archives/edgar/data/820027/000082002719000010/amp12312018.htm']
input_folder = r'D:\Tasks\10K\input_files'
#content_segmentation_file_name = '/home/mobius365/Downloads/10-K_financial_documents/content_segmentation.csv'

#co_ent_nbr_links = dict(zip(list(input_data_frame["CO_Ent_Nbr"]),list(input_data_frame["Updated_Links"])))


def page_segmentation(list_of_content,prev_index, page_number):
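    """Look for a 'PART ...' heading in one page's lines; when a new heading is found,
    write everything between the previous boundary and this page into the current
    part's output file and advance the module-level counters."""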
    global Part_page_number
    global previous_index
    global count
    global store_index_list
    global output_file_storage_folder
    global file_content_prettified_list
    global part_repeat_storage_list
    global indices
    page_soup = BeautifulSoup(" ".join(list_of_content), "lxml")
    values_with_part=page_soup.findAll(text=re.compile("Par|PAR|ART"))
    list_of_values=[]
    values_with_part=[values_list.strip() for values_list in values_with_part] 
    for Part_values in values_with_part:
        if (("ART" in Part_values.strip()[:5] or "art" in Part_values.strip()[:5] ) and Part_values.strip()[-1] in ["I","V"] and len(Part_values)<9):
            list_of_values.append(Part_values)
        elif(len(Part_values.strip())<6):
            list_of_values.append(Part_values)
        else:
            pass 

    if len(list_of_values) == 1 :
        values_parents_finder = page_soup.find(text=re.compile(list_of_values[0]))
        parent_0_value = values_parents_finder.findParents()[0].text.strip().upper()
        parent_1_value = values_parents_finder.findParents()[1].text.strip().upper()
        parent_0_value = parent_0_value.replace(u'\xa0', u' ')
        parent_1_value = parent_1_value.replace(u'\xa0', u' ')
        parent_0_value = re.sub(' +', '',parent_0_value)
        parent_1_value = re.sub(' +', '',parent_1_value)
        if ((parent_0_value[0]=='P' and  parent_0_value[-1] in ["I","V"]) or (parent_1_value[0]=='P' and  (parent_1_value[-1] in ["I","V"] or parent_1_value[-2:] in ["I.","V."] ))):

            if(parent_0_value[:4].upper()=='PART' and  (parent_0_value[-1] in ["I","V"] or parent_0_value[-2:] in ["I.","V."])):
                temp_name = re.sub('t', 't ', parent_0_value)
                temp_name = re.sub('T', 'T ', temp_name)
            else:
                temp_name = re.sub('t', 't ', parent_1_value)
                temp_name = re.sub('T', 'T ', temp_name)

            if (temp_name not in part_repeat_storage_list):
                part_repeat_storage_list.append(temp_name)
                Part_page_number[temp_name.upper()] = page_number
                next_level_index = prev_index
                with open(output_file_storage_folder+"/"+type_parts[count]+".html", "w", encoding='utf-8') as file:
                    file.write(" ".join(file_content_prettified_list[previous_index:next_level_index]))
                store_index_list.append((previous_index,next_level_index))
                previous_index = next_level_index
                count+=1
        else:
            pass
    elif len(list_of_values) == 2 :
        for two_values in list_of_values :
            values_parents_finder = page_soup.find(text=re.compile(two_values))
            parent_0_value = values_parents_finder.findParents()[0].text.strip().upper()
            parent_1_value = values_parents_finder.findParents()[1].text.strip().upper()
            parent_0_value = parent_0_value.replace(u'\xa0', u' ')
            parent_1_value = parent_1_value.replace(u'\xa0', u' ')
            parent_0_value = re.sub(' +', '',parent_0_value)
            parent_1_value = re.sub(' +', '',parent_1_value)
            if ((parent_0_value[0]=='P' and  parent_0_value[-1] in ["I","V"]) or (parent_1_value[0]=='P' and  (parent_1_value[-1] in ["I","V"] or parent_1_value[-2:] in ["I.","V."]))):
                if(parent_0_value[:4].upper()=='PART' and  parent_0_value[-1] in ["I","V"] ):
                    temp_name = re.sub('t', 't ', parent_0_value)
                    temp_name = re.sub('T', 'T ', temp_name)
                else:
                    temp_name = re.sub('t', 't ', parent_1_value)
                    temp_name = re.sub('T', 'T ', temp_name)
                if (temp_name not in part_repeat_storage_list):

                    part_repeat_storage_list.append(temp_name)
                    next_level_index = prev_index
                    Part_page_number[temp_name.upper()] = page_number
                    with open(output_file_storage_folder+"/"+type_parts[count]+".html", "w", encoding='utf-8') as file:
                        file.write(" ".join(file_content_prettified_list[previous_index:indices[indices.index(next_level_index)+1]]))
                    store_index_list.append((previous_index,next_level_index))
                    previous_index = next_level_index
                    count+=1



# Download each filing once and keep a local copy under input_folder
for link in input_files:
    html = urlopen(link).read().decode('utf-8')
    name = link.split('/')[-1]
    with open(input_folder+"/"+name, 'w', encoding='utf-8') as f:
        f.write(html)



# Segment each downloaded filing into per-PART HTML files
for links in input_files:
    files = links.split("/")[-1]
    file_name = os.path.join(input_folder,files)
    print (file_name)
    output_file_storage_folder = os.path.join(output_path,files)
    if not os.path.exists(output_file_storage_folder):
        os.makedirs(output_file_storage_folder)    
    try:
        file_content_reading = open(file_name, encoding="utf8").read()
    except Exception as e:
        print(e)
        continue  # skip this filing if the local copy cannot be read
    file_content_bs = BeautifulSoup(file_content_reading, 'lxml')
    file_content_prettified_list = file_content_bs.prettify().split("\n")
    file_content_space_removed = [tags_values.strip() for tags_values in file_content_prettified_list]

    # Tags whose style marks a page boundary (page-break-before / page-break-after)
    page_splits = file_content_bs.find_all(attrs={'style': re.compile('page-break-before|page-break-after', re.IGNORECASE)})
    if len(page_splits) >= 90:
        # Unusually many breaks: drop the <p ...> page-break tags
        page_splits = [tag_value for tag_value in page_splits if str(tag_value)[:2] != "<p"]
    # Line numbers (in the prettified HTML) where a page break occurs
    indices = [index_number for index_number, html_tags in enumerate(file_content_space_removed)
               if 'page-break-after' in html_tags.lower() or 'page-break-before' in html_tags.lower()]

    type_parts=['PART 0','PART I','PART II','PART III','PART IV']
    previous_index=0
    store_index_list=[]
    part_repeat_storage_list=[]
    count=0

    Part_page_number = { "PART 0" : 0, "PART I" : np.nan, "PART II" : np.nan , "PART III" : np.nan , "PART IV" : np.nan }

    # Walk the document page by page (between consecutive page-break lines)
    prev_index = 0
    next_index = 0
    count_page_number = 1

    for index_value in indices:
        next_index = index_value
        page_segmentation(file_content_space_removed[prev_index:index_value],prev_index,count_page_number)
        prev_index = next_index
        count_page_number+=1
    page_segmentation(file_content_space_removed[next_index:],prev_index,count_page_number)

    if len(store_index_list) != 0:
        with open(output_file_storage_folder+"/"+type_parts[count]+".html", "w", encoding='utf-8') as file:
            file.write(" ".join(file_content_prettified_list[store_index_list[-1][-1]:]))
    else:
        with open(output_file_storage_folder+"/"+type_parts[count]+".html", "w", encoding='utf-8') as file:
            file.write(" ".join(file_content_prettified_list[:]))

    Part_page_number['File_Name']=files
    list_values_page_number.append(Part_page_number)

    df_summary = pd.DataFrame(list_values_page_number)
    df_summary.to_excel("summary_10K_Page_Segmentation.xlsx",index=False)

With the code above, I am not able to split the HTML files the way I want.

EDIT:

I have added a new set of URLs.

https://www.sec.gov/Archives/edgar/data/887921/000088792119000004/rev201810-k.htm
https://www.sec.gov/Archives/edgar/data/104918/000010491819000053/ava-20181231x10k.htm
https://www.sec.gov/Archives/edgar/data/886982/000119312519050198/d669877d10k.htm
https://www.sec.gov/Archives/edgar/data/878927/000156459019004755/odfl-10k_20181231.htm
https://www.sec.gov/Archives/edgar/data/785161/000078516119000011/ehc10k123118.htm
https://www.sec.gov/Archives/edgar/data/1393818/000119312519061011/d663205d10k.htm
https://www.sec.gov/Archives/edgar/data/86521/000008652119000014/sre20181231form10k.htm
https://www.sec.gov/Archives/edgar/data/76282/000007628219000021/pkoh20181231-10k.htm
https://www.sec.gov/Archives/edgar/data/883237/000088323719000026/vrts1231201810-k.htm
https://www.sec.gov/Archives/edgar/data/883945/000088394519000016/usak-20181231.htm
https://www.sec.gov/Archives/edgar/data/1000623/000100062319000048/swmform10-k12312018.htm

Best Answer

Well, I wrote this quickly, so it is a bit convoluted.

Let me explain the code.

  1. Split on the element that separates pages ( <hr style="page-break-after:always"></hr> ).
  2. In each split page, look for the text that marks a PART and accumulate the content.
  3. Save the accumulated content for each part.

I'll paste the code below. I hope it helps.

import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm", verify=False)
file_content_reading = response.text

# Every page of the filing ends with this page-break element, so splitting on it yields one string per page
split_pages = file_content_reading.split('<hr style="page-break-after:always"></hr>')

skip_words = ['INDEX']

part_strings = [['PART I', 'PART I.', 'PART I. '],
                ['PART II', 'PART II.', 'PART II. '],
                ['PART III', 'PART III.', 'PART III. '],
                ['PART IV', 'PART IV.', 'PART IV. ']]

part_content_list = []
accumulated_content = ""
part = 0

def matching_result(content_soup, list_string):
    # Return the first string in list_string that appears as the exact text of a <span>, else None
    result = None
    for match_string in list_string:
        if content_soup.find("span", text=match_string) is not None:
            result = match_string
            break
    return result

for page in split_pages:
    content = BeautifulSoup(page, "lxml")
    # Ignore the index/table-of-contents page, which also mentions "PART I" etc.
    if part < len(part_strings) and matching_result(content, skip_words) is None:
        output = matching_result(content, part_strings[part])
        if output is not None:
            # A new PART heading starts on this page: everything before it belongs to the
            # previous part, everything from the heading onwards starts the next part.
            part += 1
            index = page.find(str(content.find("span", text=output)))
            first = page[:index]
            second = page[index:]
            part_content_list.append(accumulated_content + first)
            accumulated_content = second
        else:
            accumulated_content += page + '<hr style="page-break-after:always"></hr>'
    else:
        accumulated_content += page

part_content_list.append(accumulated_content)

# Write each collected part to its own file: output0.html (front matter) .. output4.html (PART IV)
for num, part_html in enumerate(part_content_list):
    soup = BeautifulSoup(part_html, "lxml")
    with open("output" + str(num) + ".html", "w", encoding="utf-8") as file:
        file.write(str(soup))
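
If you want to run the same idea over all of the filings listed in the question, a minimal sketch along the lines below should work. The segment_filing wrapper, the PART_n.html output naming, the per-filing output folder and the User-Agent value are my own additions for illustration, not part of the answer above (SEC.gov tends to reject requests that do not declare a User-Agent).

import os
import requests
from bs4 import BeautifulSoup

PAGE_BREAK = '<hr style="page-break-after:always"></hr>'
PART_STRINGS = [['PART I', 'PART I.'], ['PART II', 'PART II.'],
                ['PART III', 'PART III.'], ['PART IV', 'PART IV.']]

def find_heading(page_html, candidates):
    # Return the <span> whose exact text matches one of the candidate strings, else None
    soup = BeautifulSoup(page_html, "lxml")
    for text in candidates:
        tag = soup.find("span", text=text)
        if tag is not None:
            return tag
    return None

def segment_filing(html_text, output_folder):
    # Split one filing into PART_0.html .. PART_4.html inside output_folder (hypothetical wrapper)
    os.makedirs(output_folder, exist_ok=True)
    parts, current, part_index = [], "", 0
    for page in html_text.split(PAGE_BREAK):
        heading = find_heading(page, PART_STRINGS[part_index]) if part_index < len(PART_STRINGS) else None
        # Ignore the table-of-contents page, which also contains "PART I" etc.
        if heading is not None and find_heading(page, ['INDEX']) is None:
            offset = page.find(str(heading))
            if offset < 0:
                offset = 0  # heading markup not found verbatim in the raw page: start the new part at the page top
            parts.append(current + page[:offset])
            current = page[offset:]
            part_index += 1
        else:
            current += page + PAGE_BREAK
    parts.append(current)
    for number, content in enumerate(parts):
        with open(os.path.join(output_folder, "PART_" + str(number) + ".html"), "w", encoding="utf-8") as f:
            f.write(content)

headers = {"User-Agent": "your-name your-email@example.com"}  # illustrative value
for url in ['https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm',
            'https://www.sec.gov/Archives/edgar/data/820027/000082002719000010/amp12312018.htm']:
    response = requests.get(url, headers=headers)
    segment_filing(response.text, url.split("/")[-1].replace(".htm", ""))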

A similar question about splitting one HTML page into multiple HTML files based on keywords in the text can be found on Stack Overflow: https://stackoverflow.com/questions/59550860/
