php - 是否有可以清理内容的 PHP 类?

标签 php regex string oop

我一直在尝试用 PHP 实现这个功能,使用了一系列正则表达式和 preg_replace 函数。

我的主要目标是整理内容,例如确保句子的开头是大写字母、逗号后有一个空格,等等。

我试图实现的一些整理示例:

// Basic whitespace normalisation. Each pattern pushed onto
// $content_replacements_from is paired by index with the replacement pushed
// onto $content_replacements_to; both arrays are handed to preg_replace() at
// the end of the script.

// Remove any spaces around slashes ("a / b" becomes "a/b")
$content_replacements_from[] = "/\s*\/\s*/";
$content_replacements_to[] = "/";

// Replace each carriage return, line feed or tab with a single space
$content_replacements_from[] = "/[\r\n\t]/";
$content_replacements_to[] = " ";

// Collapse any run of two or more whitespace characters into one space
$content_replacements_from[] = "/\s{2,}/";
$content_replacements_to[] = " ";

// Tidy up joined full stops: put exactly one space after a full stop that
// joins two words ("world.hello" -> "world. hello", "End.Next" -> "End. Next").
//
// The previous pattern tried to skip file extensions with
// [^(jpeg|jpg|...)], but that is a negated character class: it rejected any
// single letter that occurs somewhere in the list (j, p, e, g, h, t, ...),
// so most lower-case sentence starts were never repaired.
//
// Guards used here:
//  - two letters are required before the stop, so "e.g." / "i.e." are left alone;
//  - a negative lookahead skips common file extensions and web suffixes;
//  - a lookbehind skips the stop directly after "www".
$content_replacements_from[] = '/([a-zA-Z]{2})(?<!www)\s*\.\s*(?!(?:jpe?g|png|pdf|gif|docx?|xlsx?|pptx?|html?|php|com|net|org|co|uk)\b)([a-zA-Z])/i';
$content_replacements_to[] = '$1. $2';

// Tidy up punctuation that is glued to (or spaced away from) its neighbours:
// "a ,b" -> "a, b". Each rule captures the character before and after the
// mark and re-emits them with the mark followed by exactly one space.

// Tidy up joined commas.
// The lookahead (?!(?<=[0-9]),[0-9]) leaves digit,digit untouched so that
// thousands separators such as "1,000" are not split into "1, 000".
$content_replacements_from[] = '/([a-zA-Z0-9])(?!(?<=[0-9]),[0-9])\s*,\s*([a-zA-Z0-9])/';
$content_replacements_to[] = '$1, $2';

// Tidy up joined exclamation marks
$content_replacements_from[] = '/([a-zA-Z0-9])\s*!\s*([a-zA-Z0-9])/';
$content_replacements_to[] = '$1! $2';

// Tidy up joined question marks
$content_replacements_from[] = '/([a-zA-Z0-9])\s*\?\s*([a-zA-Z0-9])/';
$content_replacements_to[] = '$1? $2';

// Tidy up joined semi colons
$content_replacements_from[] = '/([a-zA-Z0-9])\s*;\s*([a-zA-Z0-9])/';
$content_replacements_to[] = '$1; $2';

// Tidy up joined colons.
// Same digit guard as for commas so that times and ratios ("12:30", "16:9")
// keep their compact form.
$content_replacements_from[] = '/([a-zA-Z0-9])(?!(?<=[0-9]):[0-9])\s*:\s*([a-zA-Z0-9])/';
$content_replacements_to[] = '$1: $2';

// Normalise the spelling of a few abbreviations.
// Every pattern is wrapped in letter-only lookarounds so that it cannot fire
// in the middle of a word (the unguarded "UK" rule used to turn "Duke" into
// "DUKe"). Digits are still allowed next to the match, so "3000RPM" and
// "20FL.OZ" keep working.

// Tidy up fluid ounces ("Fl.Oz", "fl oz", "FLOZ" -> "fl oz")
$content_replacements_from[] = '/(?<![A-Za-z])[Ff][Ll].?\s?[Oo][Zz](?![A-Za-z])/';
$content_replacements_to[] = 'fl oz';

// Tidy up rpm
$content_replacements_from[] = '/(?<![A-Za-z])[Rr][Pp][Mm](?![A-Za-z])/';
$content_replacements_to[] = 'rpm';

// Tidy up UK
$content_replacements_from[] = '/(?<![A-Za-z])[Uu][Kk](?![A-Za-z])/';
$content_replacements_to[] = 'UK';

// Tidy up Maxi-sense and Side-by-side.
// For each term the first rule normalises every spelling to the lower-case
// hyphenated form; the second rule then capitalises it at the start of the
// text or after sentence punctuation. The punctuation and the following
// whitespace are captured and re-emitted as $1, so "!" and "?" are kept
// (the previous rules always wrote ". ").
// The (?<![A-Za-z]) lookbehind stops the term matching inside a longer word.

// Tidy up Maxi-sense
$content_replacements_from[] = '/(?<![A-Za-z])[Mm]axi[\s\-]?[Ss]ense/';
$content_replacements_to[] = 'maxi-sense';
$content_replacements_from[] = '/(^|[.!?]\s)maxi-sense/';
$content_replacements_to[] = '$1Maxi-sense';

// Tidy up Side-by-side
$content_replacements_from[] = '/(?<![A-Za-z])[Ss]ide[\s\-]?[Bb]y[\s\-]?[Ss]ide/';
$content_replacements_to[] = 'side-by-side';
$content_replacements_from[] = '/(^|[.!?]\s)side-by-side/';
$content_replacements_to[] = '$1Side-by-side';

// Tidy up extra large: expand the abbreviation "XL".
// The previous patterns ended in "{l}" (letter l) instead of "{1}", which
// PCRE reads as the literal text "{l}", so none of them could ever match.
//
// The capitalised form is handled first, while the text still contains "XL";
// once the general rule has run there is no "XL" left to capitalise.
// $1 re-emits the sentence punctuation and space instead of dropping them.
// The lookarounds keep the rule away from words and sizes that merely
// contain the letters ("axle", "2XL", "xlsx").
$content_replacements_from[] = '/(^|[.!?]\s)[Xx][Ll](?![A-Za-z])/';
$content_replacements_to[] = '$1Extra large';
$content_replacements_from[] = '/(?<![A-Za-z0-9])[Xx][Ll](?![A-Za-z])/';
$content_replacements_to[] = 'extra large';

// Tidy up D-radius and A-rate ("d radius" -> "D-radius", "a rated" -> "A-rated").
// The (?<![A-Za-z]) lookbehind requires the leading letter to start a word;
// without it the A-rate rule rewrote "separate" as "sepA-rate".
// No lookahead is used, so suffixed forms such as "A-rated" are still handled.

// Tidy up D-radius
$content_replacements_from[] = '/(?<![A-Za-z])[Dd][\s\-]?[Rr]adius/';
$content_replacements_to[] = 'D-radius';

// Tidy up A-rate
$content_replacements_from[] = '/(?<![A-Za-z])[Aa][\s\-]?[Rr]ate/';
$content_replacements_to[] = 'A-rate';

// Tidy up In-column.
// Rule 1 normalises every spelling ("In Column", "in colum", "incolumn") to
// "in-column". The lookbehind keeps it from firing inside another word
// ("within columns" used to become "within-columns").
// Rule 2 capitalises the term at the start of the text or after sentence
// punctuation; $1 re-emits the punctuation and space, which the previous
// rule deleted ("taps. in column" became "tapsIn-column").
$content_replacements_from[] = '/(?<![A-Za-z])[Ii]n[\s\-]?[Cc]olum[n]?/';
$content_replacements_to[] = 'in-column';
$content_replacements_from[] = '/(^|[.!?]\s)in-column/';
$content_replacements_to[] = '$1In-column';

// Tidy up kW ("2KW" -> "2kW", "5 kwh" -> "5 kWh").
// The lookbehind requires that no letter precedes the "k", so words that
// merely contain "kw" are left alone ("awkward" used to become "awkWard").
$content_replacements_from[] = '/(?<![A-Za-z])[Kk][Ww]/';
$content_replacements_to[] = 'kW';

// Tidy up Built-in, Built-under, Under-counter and Under-cabinet.
//
// Each term uses two rules:
//  1. normalise every spelling to the lower-case hyphenated form;
//  2. capitalise that form at the start of the text or after sentence
//     punctuation. $1 re-emits the punctuation and the space; the previous
//     rules replaced them with nothing ("Great! Built in" -> "GreatBuilt-in").
//
// (?<![A-Za-z]) keeps rule 1 from matching inside a longer word, and
// (?![A-Za-z]) on the "built" terms keeps "built into" / "built underneath"
// from being hyphenated.

// Tidy up Built-in
$content_replacements_from[] = '/(?<![A-Za-z])[Bb]uilt[\s\-]?[Ii]n(?![A-Za-z])/';
$content_replacements_to[] = 'built-in';
$content_replacements_from[] = '/(^|[.!?]\s)built-in/';
$content_replacements_to[] = '$1Built-in';

// Tidy up Built-under
$content_replacements_from[] = '/(?<![A-Za-z])[Bb]uilt[\s\-]?[Uu]nder(?![A-Za-z])/';
$content_replacements_to[] = 'built-under';
$content_replacements_from[] = '/(^|[.!?]\s)built-under/';
$content_replacements_to[] = '$1Built-under';

// Tidy up Under-counter
$content_replacements_from[] = '/(?<![A-Za-z])[Uu]nder[\s\-]?[Cc]ounter/';
$content_replacements_to[] = 'under-counter';
$content_replacements_from[] = '/(^|[.!?]\s)under-counter/';
$content_replacements_to[] = '$1Under-counter';

// Tidy up Under-cabinet
$content_replacements_from[] = '/(?<![A-Za-z])[Uu]nder[\s\-]?[Cc]abinet/';
$content_replacements_to[] = 'under-cabinet';
$content_replacements_from[] = '/(^|[.!?]\s)under-cabinet/';
$content_replacements_to[] = '$1Under-cabinet';

// Tidy up integrated: when a letter or digit is followed by exactly one
// whitespace character and then "-integrated" / "-Integrated", drop the
// whitespace and lower-case the "i" ("semi -Integrated" -> "semi-integrated").
$content_replacements_from[] = "/([a-zA-Z0-9]{1})[\s]{1}[\-]{1}[Ii]{1}ntegrated/";
$content_replacements_to[] = "$1-integrated";

// Tidy up Semi-/Fully-integrated, Semi-/Fully-automatic and Pull-out.
//
// Each term uses two rules:
//  1. normalise every spelling to the lower-case hyphenated form
//     ((?<![A-Za-z]) keeps the match at the start of a word);
//  2. capitalise that form at the start of the text or after sentence
//     punctuation. $1 re-emits the punctuation and the space, which the
//     previous rules deleted ("Quiet? Fully integrated" -> "QuietFully-integrated").

// Tidy up Semi-integrated
$content_replacements_from[] = '/(?<![A-Za-z])[Ss]emi[\s\-]?[Ii]ntegrated/';
$content_replacements_to[] = 'semi-integrated';
$content_replacements_from[] = '/(^|[.!?]\s)semi-integrated/';
$content_replacements_to[] = '$1Semi-integrated';

// Tidy up Fully-integrated
$content_replacements_from[] = '/(?<![A-Za-z])[Ff]ully[\s\-]?[Ii]ntegrated/';
$content_replacements_to[] = 'fully-integrated';
$content_replacements_from[] = '/(^|[.!?]\s)fully-integrated/';
$content_replacements_to[] = '$1Fully-integrated';

// Tidy up Semi-automatic
$content_replacements_from[] = '/(?<![A-Za-z])[Ss]emi[\s\-]?[Aa]utomatic/';
$content_replacements_to[] = 'semi-automatic';
$content_replacements_from[] = '/(^|[.!?]\s)semi-automatic/';
$content_replacements_to[] = '$1Semi-automatic';

// Tidy up Fully-automatic
$content_replacements_from[] = '/(?<![A-Za-z])[Ff]ully[\s\-]?[Aa]utomatic/';
$content_replacements_to[] = 'fully-automatic';
$content_replacements_from[] = '/(^|[.!?]\s)fully-automatic/';
$content_replacements_to[] = '$1Fully-automatic';

// Tidy up Pull-out
$content_replacements_from[] = '/(?<![A-Za-z])[Pp]ull[\s\-]?[Oo]ut/';
$content_replacements_to[] = 'pull-out';
$content_replacements_from[] = '/(^|[.!?]\s)pull-out/';
$content_replacements_to[] = '$1Pull-out';

// Tidy up including: expand a free-standing "inc", "incl", "inc." or "incl."
// (any case on the first letter) to "including".
// The (?<![.!?]) lookbehind skips an occurrence that opens a new sentence so
// that it is not forced to lower case here.
$content_replacements_from[] = '/(?<![.!?])\s[Ii]nc[l]?[\.]?\s/';
$content_replacements_to[] = ' including ';

// Tidy up use: lower-case a free-standing "Use" in the middle of a sentence.
// The lookbehind leaves "Use" alone directly after sentence punctuation;
// previously ". Use daily" was rewritten to ". use daily".
$content_replacements_from[] = '/(?<![.!?])\s[Uu]se\s/';
$content_replacements_to[] = ' use ';

// Hyphenate single-character prefixes: "3 piece" -> "3-piece",
// "t piece" -> "t-piece", "c spout" -> "c-spout", "c end" -> "c-end".
// The (?<![A-Za-z]) lookbehind requires the prefix character to stand on its
// own; without it the last letter of any word was treated as the prefix
// ("every piece" -> "every-piece", "descend" -> "desc-end").

// Tidy up ?-piece
$content_replacements_from[] = '/(?<![A-Za-z])([2345TtYy])[\s\-]?[Pp]iece/';
$content_replacements_to[] = '$1-piece';

// Tidy up ?-spout
$content_replacements_from[] = '/(?<![A-Za-z])([Cc])[\s\-]?[Ss]pout/';
$content_replacements_to[] = '$1-spout';

// Tidy up ?-end
$content_replacements_from[] = '/(?<![A-Za-z])([Cc])[\s\-]?[Ee]nd/';
$content_replacements_to[] = '$1-end';

// Expand abbreviated steel finishes.
// "b-steel" / "B/Steel" -> "brushed steel", "s-steel" / "S/Steel" ->
// "stainless steel". The (?<![A-Za-z]) lookbehind requires the single letter
// to stand on its own; otherwise the final letter of a preceding word was
// consumed ("brass-steel" -> "brasstainless steel").

// Tidy up Brushed Steel
$content_replacements_from[] = '/(?<![A-Za-z])[Bb][\-\/][Ss]teel/';
$content_replacements_to[] = 'brushed steel';

// Tidy up Stainless Steel
$content_replacements_from[] = '/(?<![A-Za-z])[Ss][\-\/][Ss]teel/';
$content_replacements_to[] = 'stainless steel';

// Tidy up Silk Steel ("SilkSteel", "Silk Steel" -> "silk steel")
$content_replacements_from[] = '/[Ss]ilk[\s]?[Ss]teel/';
$content_replacements_to[] = 'silk steel';

// Typographic symbol clean-up. The patterns contain the literal (multi-byte)
// characters and carry no "u" modifier.

// Remove trade marks: delete every trade mark sign
$content_replacements_from[] = "/™/";
$content_replacements_to[] = "";

// Replace long dashes: each en dash becomes a plain hyphen-minus
$content_replacements_from[] = "/–/";
$content_replacements_to[] = "-";

// Replace single quotes: both the typographic right single quote and the
// backtick become a plain apostrophe
$content_replacements_from[] = "/’/";
$content_replacements_to[] = "'";
$content_replacements_from[] = "/`/";
$content_replacements_to[] = "'";

// Tidy up m: abbreviate "metre"/"metres" after a number ("2 metres" -> "2m").
// The rule now requires a leading digit and a word end. The previous pattern
// replaced the bare text "metre" (plus an optional leading space) anywhere,
// which produced "perm" from "per metre", "kilom" from "kilometre" and "2ms"
// from "2 metres".
$content_replacements_from[] = '/([0-9])\s?[Mm]etres?(?![A-Za-z])/';
$content_replacements_to[] = '$1m';

// Tidy up m3: a digit followed by "m3"/"M3" gets a real superscript three,
// and the HTML entity for it is replaced by the character itself.
// (The replacement was html_entity_decode() applied to the already-decoded
// character, which returns it unchanged, so the literal is used directly.)
$content_replacements_from[] = '/([0-9])[\s]?[Mm]3/';
$content_replacements_to[] = '$1m³';
$content_replacements_from[] = '/&sup3;/';
$content_replacements_to[] = '³';

// Tidy up to in between numbers: "to" between two digits (with at most one
// whitespace character on either side) becomes a spaced hyphen
// ("3 to 5" -> "3 - 5", "10to20" -> "10 - 20")
$content_replacements_from[] = "/([0-9]{1})[\s]?to[\s]?([0-9]{1})/";
$content_replacements_to[] = "$1 - $2";

// Tidy up per hour: at the very end of the text, whitespace + "and" +
// whitespace + "h" or "hr" is replaced by "ph" (the leading whitespace is
// part of the match, so it is removed too).
// NOTE(review): the heading says "per hour" but the pattern matches the word
// "and", not "per" - confirm which word was intended.
$content_replacements_from[] = "/\s[Aa]{1}nd\s[Hh]{1}[Rr]?$/";
$content_replacements_to[] = "ph";

// Tidy up l: abbreviate "litre"/"litres" after a number ("2 litres" -> "2l").
// The rule now requires a leading digit and a word end. The previous pattern
// replaced the bare text "litre" anywhere, which produced "millil" from
// "millilitre", "perl" from "per litre" and "2ls" from "2 litres".
$content_replacements_from[] = '/([0-9])\s?[Ll]itres?(?![A-Za-z])/';
$content_replacements_to[] = '$1l';

// Tidy up -in: lower-case the "I" of any "-In" so that it reads "-in".
// The match is not limited to whole words, so it also affects longer words
// that follow a hyphen (for example "-Inch" becomes "-inch").
$content_replacements_from[] = "/\-[Ii]{1}n/";
$content_replacements_to[] = "-in";

// Tidy up plus: lower-case a free-standing "Plus" in the middle of a sentence.
// The (?<![.!?]) lookbehind leaves the word alone when it opens a new
// sentence (previously ". Plus more" became ". plus more").
$content_replacements_from[] = '/(?<![.!?])\s[Pp]lus\s/';
$content_replacements_to[] = ' plus ';

// Tidy up including: same treatment for a free-standing "Including"
$content_replacements_from[] = '/(?<![.!?])\s[Ii]ncluding\s/';
$content_replacements_to[] = ' including ';

// Tidy up including: expand a leading "Inc " / "inc " to "Including ".
// The (?<![A-Za-z]) lookbehind makes sure "inc" is a word of its own;
// without it "Zinc tray" was rewritten to "ZIncluding tray".
$content_replacements_from[] = '/(?<![A-Za-z])[Ii]nc\s/';
$content_replacements_to[] = 'Including ';

// Tidy up Push/pull.
// Rule 1 normalises every capitalisation to "push/pull" (the lookbehind keeps
// it at the start of a word). Rule 2 capitalises it at the start of the text
// or after sentence punctuation; $1 re-emits the punctuation and space,
// which the previous rule deleted ("Easy! Push/Pull" -> "EasyPush/pull").
$content_replacements_from[] = '/(?<![A-Za-z])[Pp]ush\/[Pp]ull/';
$content_replacements_to[] = 'push/pull';
$content_replacements_from[] = '/(^|[.!?]\s)push\/pull/';
$content_replacements_to[] = '$1Push/pull';

// Tidy up +: a plus sign with whitespace on both sides becomes the word
// "and" ("fan + light" -> "fan and light")
$content_replacements_from[] = "/\s\+\s/";
$content_replacements_to[] = " and ";

// Tidy up *: remove every asterisk
$content_replacements_from[] = "/\*/";
$content_replacements_to[] = "";

// Lower-case common joining words that appear capitalised in the middle of a
// sentence ("Oven With Grill" -> "Oven with Grill"). Each rule matches the
// word with whitespace on both sides and rewrites it with single spaces.
//
// The (?<![.!?]) lookbehind skips a word that directly follows sentence
// punctuation, so the first word of a sentence keeps its capital
// (previously "clean. With grill" became "clean. with grill").

// Tidy up with
$content_replacements_from[] = '/(?<![.!?])\s[Ww]ith\s/';
$content_replacements_to[] = ' with ';

// Tidy up without
$content_replacements_from[] = '/(?<![.!?])\s[Ww]ithout\s/';
$content_replacements_to[] = ' without ';

// Tidy up in
$content_replacements_from[] = '/(?<![.!?])\s[Ii]n\s/';
$content_replacements_to[] = ' in ';

// Tidy up of
$content_replacements_from[] = '/(?<![.!?])\s[Oo]f\s/';
$content_replacements_to[] = ' of ';

// Tidy up for
$content_replacements_from[] = '/(?<![.!?])\s[Ff]or\s/';
$content_replacements_to[] = ' for ';

// Tidy up or
$content_replacements_from[] = '/(?<![.!?])\s[Oo]r\s/';
$content_replacements_to[] = ' or ';

// Tidy up and
$content_replacements_from[] = '/(?<![.!?])\s[Aa]nd\s/';
$content_replacements_to[] = ' and ';

// Tidy up to
$content_replacements_from[] = '/(?<![.!?])\s[Tt]o\s/';
$content_replacements_to[] = ' to ';

// Tidy up too
$content_replacements_from[] = '/(?<![.!?])\s[Tt]oo\s/';
$content_replacements_to[] = ' too ';

// Tidy up &: a free-standing ampersand becomes the word "and"
$content_replacements_from[] = '/\s&\s/';
$content_replacements_to[] = ' and ';

// Tidy up &amp;: the HTML-encoded ampersand gets the same treatment.
// (This rule was an exact duplicate of the one above and could never match
// anything the first had not already replaced.)
$content_replacements_from[] = '/\s&amp;\s/';
$content_replacements_to[] = ' and ';

// Tidy up mm: lower-case "MM" / "Mm" when used as a unit ("600MM" -> "600mm").
// The letter-only lookarounds keep the rule away from words that contain the
// letters ("COMMON" used to become "COmmON"); digits may still precede it.
$content_replacements_from[] = '/(?<![A-Za-z])M[Mm](?![A-Za-z])/';
$content_replacements_to[] = 'mm';

// Convert American -ize/-yze spellings to British -ise/-yse.
// Every rule requires at least two letters before the suffix.

// Tidy up ize to ise (also covers -ized, -izes, -izer, -izers).
// The lookahead only accepts "ize" when it ends the word or is followed by
// one of those endings, so words that merely contain the letters are left
// alone ("citizen" used to become "citisen").
// The separate "izer" rule was removed: it ran after this one and therefore
// never found an "izer" left to replace.
// NOTE(review): real words ending in "ize" (such as "prize" or "oversize")
// are still rewritten; excluding them needs a word list.
$content_replacements_from[] = '/([a-zA-Z]{2})ize(?=(?:[ds]|rs?)?(?![a-zA-Z]))/';
$content_replacements_to[] = '$1ise';

// Tidy up yze to yse
$content_replacements_from[] = '/([a-zA-Z]{2})yze/';
$content_replacements_to[] = '$1yse';

// Tidy up ization to isation
$content_replacements_from[] = '/([a-zA-Z]{2})ization/';
$content_replacements_to[] = '$1isation';

// Tidy up times symbol: an "x" between a digit and a following letter or
// digit becomes a spaced multiplication sign ("60x85" -> "60 × 85").
//  - The following character is only looked at, not consumed, so chained
//    dimensions are converted completely ("2x3x4" -> "2 × 3 × 4"; the
//    previous rule stopped after the first pair).
//  - (?![a-z]{2}) skips an "x" that starts a lower-case word
//    ("5 xenon" used to become "5 × enon").
$content_replacements_from[] = '/([0-9])\s*[Xx](?![a-z]{2})\s*(?=[0-9A-Za-z])/';
$content_replacements_to[] = '$1 × ';

// Tidy up times symbol: replace the HTML entity with the character itself.
// (The replacement was html_entity_decode() applied to the already-decoded
// character, which returns it unchanged, so the literal is used directly.)
$content_replacements_from[] = '/&times;/';
$content_replacements_to[] = '×';

// Tidy up inches: a digit, optional whitespace and "inches"/"Inches" becomes
// the digit followed by a double-quote mark (12 inches -> 12")
$content_replacements_from[] = "/([0-9]{1})\s*[Ii]{1}nches/";
$content_replacements_to[] = "$1\"";

// Tidy up inch: the same for the singular form. It is listed after the
// plural rule, so the plural has already been handled when this one runs.
$content_replacements_from[] = "/([0-9]{1})\s*[Ii]{1}nch/";
$content_replacements_to[] = "$1\"";

// Make the replacements: apply every pattern/replacement pair, in the order
// they were added, to $content.
// preg_replace() returns null when PCRE reports an error (for example when
// the backtrack limit is hit). Only overwrite $content on success so that a
// failing rule cannot silently wipe the text.
$content_cleaned = preg_replace($content_replacements_from, $content_replacements_to, $content);
if ($content_cleaned !== null) {
    $content = $content_cleaned;
}
unset($content_cleaned);

这显然是复杂而冗长的。

有没有人知道更好的方法或知道可以做到这一点的类(class)?

如果可能的话,我还想将其应用于 HTML 中的内容。

最佳答案

正则表达式非常适合文本搜索和替换。你给出的代码表明还有改进的余地。但我的回答不是关于优化这些正则表达式,而是建议你开始构建自己的一组 StringCleaner,它们可以做不同的事情,但都具有相同的接口:

/**
 * Common contract shared by every cleaner: take a string and hand back the
 * cleaned version of it.
 */
interface StringCleaner
{
    /**
     * Clean the given text.
     *
     * @param string $string Text to clean.
     * @return string The cleaned text (presumably; the usage example assigns
     *                the result straight back to a text node's data).
     */
    public function clean($string);
}

接下来,对于 HTML,我的一个想法是创建一个 FilterIterator,它提供对所有文本节点的访问,因此可以更轻松地使用任何标准清洁器更改它们。

要一次应用多个 StringCleaner(并创建它们的集合),我使用了组合模式(Composite Pattern,通过扩展 SplObjectStorage 实现),这个组合本身也是一个 StringCleaner。

没有类定义的例子:

// Build the individual cleaners and combine them into one composite.
// TrimCleaner, RegexCleaner and CleanerComposite are not defined in this
// snippet. Rules are given to addRule() without regex delimiters.

$cleanerTrim = new TrimCleaner();

// General-purpose whitespace and punctuation rules
$cleanerBasics = new RegexCleaner();

// Remove any spaces around slashes
$cleanerBasics->addRule('\s*\/\s*', '/');

// Remove any new lines or tabs
$cleanerBasics->addRule('[\r\n\t]', ' ');

// Tidy up joined full stops (the negative lookahead skips file extensions)
$cleanerBasics->addRule('(\w+)\.(?!jpeg|jpg|png|pdf|gif|doc|xls|docx|xlsx|ppt|pptx|html|php|htm)(\w+)', '$1. $2');

// Remove any extra spaces
$cleanerBasics->addRule('\s{2,}', ' ');

// Remove single spaces (a string consisting of exactly one whitespace character)
$cleanerBasics->addRule('^\s$', '');

// Separate cleaner holding the unit rules
$cleanerInches = new RegexCleaner();

// Tidy up inches
$cleanerInches->addRule('([0-9])\s*[Ii]nches', '$1"');


// Attach the cleaners to the composite: basics, then inches, then trim
$cleaner = new CleanerComposite();
$cleaner->attach($cleanerBasics);
$cleaner->attach($cleanerInches);
$cleaner->attach($cleanerTrim);


// Sample HTML used to exercise the cleaners: it contains joined full stops,
// a trailing space and line break inside a table cell, a run of spaces and
// an "inches" measurement.
$htmlString = <<<HTML
<html>
  <head>
    <title>
        hello world.hello earth.
    </title>
  </head>
  <body>
<table><tr><td>test. 
</td></tr></table>
     <h1>Get it 1 more time.</h1>
     <p>When 12 inches were not enough;      hickup.</p>

  </body>
</html>
HTML;


// load HTML
// NOTE(review): loadHTML() is called without any charset hint - confirm the
// behaviour for non-ASCII input.
$dom = new DOMDocument();
$dom->preserveWhiteSpace = FALSE;
$dom->loadHTML($htmlString);

// create XPath
$xpath = new DomXPath($dom);

// Run every text node selected by '//text()' through the composite cleaner
// and write the result back into the node. DOMTextWhiteSpaceFilter is not
// defined in this snippet; presumably, with no second argument, it skips
// whitespace-only nodes - verify against its definition.
$it = new DOMTextWhiteSpaceFilter($xpath->query('//text()'));
foreach($it as $node)
{
    $node->data = $cleaner->clean($node->data);
}

// remove whitespace only nodes
// (the text nodes are queried again after cleaning; the WHITESPACE flag
// presumably selects only the whitespace-only ones)
$it = new DOMTextWhiteSpaceFilter($xpath->query('//text()'), DOMTextWhiteSpaceFilter::WHITESPACE);
foreach($it as $node)
{
    $node->parentNode->removeChild($node);
}

// Pretty-print the resulting document
$dom->formatOutput = true;
echo $dom->saveHTML();

如示例所示,当您将复杂性隐藏到具体的 StringCleaner 对象中时,您可以开始创建更多动态规则。这可以通过添加更多操作不同于正则表达式的StringCleaner 类型来扩展,TrimCleaner 中给出了一个非常简单的trim 示例。

当然,正则表达式也非常强大。正如您在 RegexCleaner 中看到的那样,我已将每个正则表达式定界符移到类本身中,因此当您定义规则时,您不需要一遍又一遍地输入它们。这只是另一个简单的示例,当您将替换封装到它自己的类中并为操作定义接口(interface)时,您可以简化事情。

Full Example .

关于php - 是否有可以清理内容的 PHP 类?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/7888752/

相关文章:

javascript - 删除字符串,但保持其中间部分

regex - 在 Linux 服务器上使用 "find"和正则表达式否定

Java/Android 正则表达式问题

string - 给定一个字符串 X 和该字符串的反转 Y。 X 和 Y 的最长公共子序列是否总是回文?

c - 如何以标准方式修剪前导/尾随空格?

php - "=>"在 PHP 中是什么意思?

javascript - 如何将 php 变量作为参数传递给 JavaScript 函数

php - sudo: 不存在 tty 且未指定 askpass 程序使用 shell_exec 时

javascript - 从导航栏到同一页面内主要部分的链接(内部 php/html 文件)

c - 使用 sscanf 匹配零个或多个空格