CS241e Source: Grammars


/*
   Copyright 2024 Ondrej Lhotak. All rights reserved.

   Permission is granted for private study use by students registered in
   CS 241E in the Fall 2024 term.

   The contents of this file may not be published, in whole or in part,
   in print or electronic form.

   The contents of this file may be included in work submitted for CS
   241E assignments in Fall 2024. The contents of this file may not be
   submitted, in whole or in part, for credit in any other course.
*/
package cs241e.scanparse

import DFAs.Token

object Grammars {

  /** A representation of a production rule in a context-free grammar. */
  case class Production(lhs: String, rhs: Seq[String]) {
    override def toString: String = lhs + " " + rhs.mkString(" ")
  }

  /** A representation of a context-free grammar. */
  case class Grammar(productions: Seq[Production]) {
    val nonTerminals = productions.map(_.lhs).toSet
    val symbols = nonTerminals ++ productions.flatMap(_.rhs).toSet
    val terminals = symbols -- nonTerminals
    val start = productions.head.lhs

    /** Given a non-terminal symbol, returns the set of production rules that have the symbol as their left-hand side.
      **/
    val productionsExpanding: Map[String, Seq[Production]] =
      productions.groupBy(_.lhs).withDefaultValue(Seq())

    override def toString: String = productions.mkString("\n")
  }

  /** A representation of a node of a parse tree.
    *
    * The grammar symbol corresponding to the tree node is in `lhs.kind`. If the tree node is a leaf node representing
    * a terminal (token), the lexeme of the token is in `lhs.lexeme`; otherwise, lhs.lexeme is the empty string.
    */
  class Tree(val lhs: Token, val children: Seq[Tree] = Seq.empty) {
    def production: String = lhs.kind + " " + children.map(_.lhs.kind).mkString(" ")
    def show(indent: Int = 0): String = " " * indent + lhs + "\n" + children.map(_.show(indent + 1)).mkString
    override def toString: String = show(0)
  }

  /** Generates a `Grammar` from a textual description. Each non-empty line of the input is considered a production
    * rule of the grammar. The symbols of the production rule are space-separated words on the line. The first word
    * is the lhs of the production rule, and the remaining words are the rhs. The lhs of the first production rule
    * is considered the start symbol of the grammar.
    */
  def parseGrammar(input: String): Grammar = {
    def parseLine(line: String): Option[Production] = {
      val tokens = line.trim.split(" ").filterNot(_.isEmpty)
      if (tokens.isEmpty) None else Some(Production(tokens.head, tokens.tail))
    }
    val productions = input.split("\n").flatMap(parseLine)
    Grammar(productions)
  }
}