javascript - 使用 parsimmon 库解析基于缩进的语言

标签 javascript parsing parser-combinators

我的问题受到另一个问题(this one)的启发,不过针对的是 JavaScript,并且使用 parsimmon 解析器组合器库。我想解析对缩进敏感的语言,例如 Python 或 YAML。

我已经很容易地把那个答案中的 Scala 示例转换成了 JavaScript —— 关键是 parsimmon 中的 chain 函数,它等同于 Scala 解析器组合器中的 >>> 运算符:二者都接受一个解析器和一个返回解析器的函数,并把第一个解析器的结果传给该函数,以选择下一个解析器。

但是,我不太清楚如何让它支持递归。该示例只适用于单个 block —— 我看不出如何创建嵌套的 block,并根据需要跟踪缩进减少(dedent)的层级,从而解析类似 Python 的语言。

最佳答案

我是 Parsimmon 的维护者。我意识到这个问题真的很老,但我偶然发现了它并想回答。

GitHub 上 parsimmon 存储库中的 python-ish.js 示例应该可以帮助您了解如何解析基于缩进的语言。

这与 Josh 的回答非常相似,但我认为更容易理解一些。

https://github.com/jneen/parsimmon/blob/master/examples/python-ish.js

"use strict";

// Run me with Node to see my output!

let util = require("util");
let P = require("..");

///////////////////////////////////////////////////////////////////////

// Because parsing indentation-sensitive languages such as Python requires
// tracking state, all of our parsers are created inside a function that takes
// the current parsing state. In this case it's just the current indentation
// level, but a real Python parser would also *at least* need to keep track of
// whether the current parsing is inside of () or [] or {} so that you can know
// to ignore all whitespace, instead of further tracking indentation.
//
// Implementing all of Python's various whitespace requirements, including
// comments and line continuations (backslash at the end of the line) is left as
// an exercise for the reader. I've tried and frankly it's pretty tricky.

/**
 * Build the set of parsers for one indentation level.
 *
 * @param {number} indent - Number of leading spaces that statements at the
 *   current nesting level must have.
 * @returns {object} The language object produced by `P.createLanguage`, with
 *   the parsers `Block`, `Statement`, `RestStatement`, `Ident`, `CountSpaces`,
 *   `IndentSame`, `IndentMore`, `NL` and `End`.
 */
function PyX(indent) {
  return P.createLanguage({
    // This is where the magic happens. Basically we need to parse a deeper
    // indentation level on the first statement of the block and keep track of
    // new indentation level. Then we make a whole new set of parsers that use
    // that new indentation level for all their parsing. Each line past the
    // first is required to be indented to the same level as that new deeper
    // indentation level.
    //
    // Yields ["BLOCK", firstStatement, ...remainingStatements].
    Block: r =>
      P.seqObj(
        P.string("block:"),
        r.NL,
        ["n", r.IndentMore],
        ["first", r.Statement]
      ).chain(args => {
        const { n, first } = args;
        // Recurse with the newly discovered indentation so nested blocks are
        // parsed by a language instance that knows its own level.
        return PyX(n)
          .RestStatement.many()
          .map(rest => ["BLOCK", first, ...rest]);
      }),

    // This is just a statement in our language. To simplify, this is either a
    // block of code or just an identifier
    Statement: r => P.alt(r.Block, r.Ident),

    // This is a statement which is indented to the level of the current parse
    // state. It's called RestStatement because the first statement in a block
    // is indented more than the previous state, but the *rest* of the
    // statements match up with the new state.
    RestStatement: r => r.IndentSame.then(r.Statement),

    // Just a variable and then the end of the line.
    Ident: r => P.regexp(/[a-z]+/i).skip(r.End),

    // Consume zero or more spaces and then return the number consumed. For a
    // more Python-like language, this parser would also accept tabs and then
    // expand them to the correct number of spaces
    //
    // https://docs.python.org/3/reference/lexical_analysis.html#indentation
    CountSpaces: () => P.regexp(/[ ]*/).map(s => s.length),

    // Count the current indentation level and assert it's equal to the current
    // parse state's desired indentation
    IndentSame: r =>
      r.CountSpaces.chain(n => {
        if (n === indent) {
          return P.of(n);
        }
        // The failure message describes what was *expected*, so report the
        // required indentation rather than the count that was actually seen.
        return P.fail(`${indent} spaces`);
      }),

    // Count the current indentation level and assert it's more than the current
    // parse state's desired indentation
    IndentMore: r =>
      r.CountSpaces.chain(n => {
        if (n > indent) {
          return P.of(n);
        }
        // As above: describe the expectation in terms of the required level.
        return P.fail(`more than ${indent} spaces`);
      }),

    // Support all three standard text file line endings
    NL: () => P.alt(P.string("\r\n"), P.oneOf("\r\n")),

    // Lines should always end in a newline sequence, but many files are missing
    // the final newline
    End: r => P.alt(r.NL, P.eof)
  });
}

// Start parsing at zero indentation. The binding is never reassigned, so it is
// declared with const.
const Pythonish = PyX(0);

///////////////////////////////////////////////////////////////////////

// Sample program. Leading spaces are significant: the nested blocks are
// indented by 2, 9 and 10 spaces respectively, and the final empty entry
// supplies the trailing newline.
let text = [
  "block:",
  "  alpha",
  "  bravo",
  "  block:",
  "         charlie",
  "         delta",
  "         echo",
  "         block:",
  "          foxtrot",
  "  golf",
  ""
].join("\n");

/**
 * Print a value to stdout using util.inspect with unlimited nesting depth.
 *
 * @param {*} x - The value to display (here, the parsed AST).
 * @returns {void}
 */
function prettyPrint(x) {
  // util.inspect's `colors` option is a boolean. The previous value "auto" was
  // just a truthy string, which forced ANSI escape codes even when the output
  // was piped or redirected. Colourise only when stdout is a terminal.
  const opts = { depth: null, colors: Boolean(process.stdout.isTTY) };
  console.log(util.inspect(x, opts));
}

// Parse the sample program as a single top-level Statement and display the
// resulting AST. The binding is never reassigned, so it is declared with const.
const ast = Pythonish.Statement.tryParse(text);
prettyPrint(ast);

关于javascript - 使用 parsimmon 库解析基于缩进的语言,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40499588/

相关文章:

javascript - 使用深度比较或 json.stringify 比较两个对象?

javascript - 无法检索 POST 数据

javascript - 如何制作一个 NodeJS 服务器,在连接时按时间间隔逐行发出字符串文件?

c - 在 C 中解析程序参数时有什么好的做法

scala - 如何避免 Fastparse 中的左递归无限循环?

java - 解析器组合器中的递归耗尽堆栈空间

javascript - Ajax 接收来自服务器的响应并将其保存在变量中以供以后 PhP 使用

javascript - 如何 '$parse' 指令中的作用域对象传递整个对象

Char 数组到数学方程

haskell - parsec:带有有用错误消息的字符串选择解析器