From f13946e88b4ef2a8843c01505c06d1fb74dad732 Mon Sep 17 00:00:00 2001
From: Teddy <ted.sybil@gmail.com>
Date: Wed, 7 May 2014 16:59:45 +0800
Subject: report: type system

---
 cibic.tex | 116 +++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 93 insertions(+), 23 deletions(-)

(limited to 'cibic.tex')

diff --git a/cibic.tex b/cibic.tex
index c1a5d87..9324b9d 100644
--- a/cibic.tex
+++ b/cibic.tex
@@ -71,8 +71,9 @@ to let the user do some extra actions after each token is read. So I maintained
 a global variable ``\texttt{yycolumn}'' to keep the current column, a global
 char array ``\texttt{linebuff}'' to buffer the text being read for error
 report.
-\begin{figure}
+\begin{listing}[H]
     \centering
+    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
     \begin{minted}{c}
 int yycolumn = 1;
 char linebuff[MAX_LINEBUFF], *lptr = linebuff;
@@ -94,7 +95,7 @@ char linebuff[MAX_LINEBUFF], *lptr = linebuff;
     } while (0)
 \end{minted}
 \caption {Code Snippet to Track Down Location}
-\end{figure}
+\end{listing}
 
 \section{Semantic Analysis and Implementation}
 The parsing process is also an AST (Abstrct Syntax Tree) construction process.
@@ -195,24 +196,26 @@ depicted in figure.
 Thanks to the level information kept in each ``\texttt{CTNode}'', we do not
 have to establish a hash table for every scopes, which may be memory consuming.
 Instead, whenever a new scope begins, CIBIC simply pushes a new frame to scope
-stack. This is achieved by creating an instance of ``\texttt{CSNode}'', setting its
-``\texttt{next}'' field to the ``\texttt{top}'' field of the
+stack. This is achieved by creating an instance of ``\texttt{CSNode}'', setting
+its ``\texttt{next}'' field to the ``\texttt{top}'' field of the
 ``\texttt{CScope}'' then letting the ``\texttt{top}'' points to the new frame,
-finally increasing ``\texttt{lvl}'' by one. Whenever a new symbol is being added
-to ``\texttt{CScope}'', CIBIC adds the symbol to one of the tables
+finally increasing ``\texttt{lvl}'' by one. Whenever a new symbol is being
+added to ``\texttt{CScope}'', CIBIC adds the symbol to one of the tables
 ``\texttt{ids}'' and ``\texttt{tags}'', then also appends the symbol to the
-``\texttt{symlist}'' of the top of the scope stack. The most elegant characteristics of
-open addressing hash tables is, for the same name appears in different scopes,
-the symbol defined at the inner most is always ahead of others in the chain of
-the table because it is the most recently added one. So, for lookups, CIBIC
-just need to return the first node having the same name in the chain, which is very
-time-efficient. At last, when a scope ends, CIBIC scans the whole
-``\texttt{symlist}'' of the top scope frame, and tries to remove these symbols
-from the table. Figure presents the content of the scope stack when the
-analysis proceeds into the inner most declaration of a. See chains with hash
-code 0097 and 0098.
+``\texttt{symlist}'' of the top of the scope stack. The most elegant
+characteristics of open addressing hash tables is, for the same name appears in
+different scopes, the symbol defined at the inner most is always ahead of
+others in the chain of the table because it is the most recently added one. So,
+for lookups, CIBIC just need to return the first node having the same name in
+the chain, which is very time-efficient. At last, when a scope ends, CIBIC
+scans the whole ``\texttt{symlist}'' of the top scope frame, and tries to
+remove these symbols from the table. Figure presents the content of the scope
+stack when the analysis proceeds into the inner most declaration of a. See
+chains with hash code 0097 and 0098.
 \begin{figure}[H]
-    \centering
+%    \centering
+    \begin{minipage}{0.35\textwidth}
+%    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
     \begin{minted}{c}
 int main() {
     int a, b;
@@ -226,11 +229,10 @@ int main() {
     }
 }
     \end{minted}
-\caption {Source Code Being Proceeded}
-\end{figure}
-
-\begin{figure}[H]
-    \centering
+ \caption {Source Code Being Proceeded}
+\end{minipage}
+%    \centering
+    \begin{minipage}{0.5\textwidth}
     \begin{BVerbatim}
     [0072]->[int@747e780:0]
     [0097]->[a@7484ae0:3]->[a@7484890:2]->[a@7484580:1]
@@ -247,8 +249,76 @@ int main() {
     [0971]->[char@747e9f0:0]
     \end{BVerbatim}
     \caption {Dump Info from CIBIC: Scope Stack}
+\end{minipage}
 \end{figure}
-\subsection{Type Definition}
+
+\subsection{Type System}
+C has a small set of basic types. In CIBIC, basic types include \texttt{char},
+\texttt{int} and \texttt{void} (literally, \texttt{void} is not an actual
+type). However, C supports two powerful type aggregation: arrays and structures
+(unions), and also supports an indirect access tool: pointer. This makes the
+type system much more complex and usable in practice. CIBIC uses the concept
+``type tree'' to organize types. All basic types are the leaves of a type tree,
+while aggregate types and pointers are the intermediate nodes. Figure shows a
+typical type tree of a C struct. 
+
+A type tree is good at preserving type hierarchy info which may be extremely
+useful in type checking. For example, when checking a expression \texttt{*a},
+compiler first check if the root of the type tree of a is a pointer. If not,
+error message will be printed and semantic checking fails. Otherwise, the type
+of the expression result is the subtree of the only child of the root. Also, a
+type tree enables us to implement complex type nesting, which will be discussed
+later on.
+
+CIBIC has support for user-defined types, which are defined via the keyword
+``\texttt{typedef}''.  However, ``\texttt{typedef}'' is notoriously difficult
+to deal with due to the ambiguity caused by the language design. For example,
+in ``\texttt{int A}'', A is a \textbf{variable} of integer type, but in
+``\texttt{A a}'', A is a user-defined \textbf{type}. The subtle semantic
+difference is caused by context. In former case, A is identified as a
+identifier token, while in latter case, identified as a type specifier. The
+meaning of a token should be made clear during lexical analysis which does not
+take in account of context. So the most direct and effective way to implement
+\texttt{typedef} is to hack the lexer and parser. CIBIC maintains a
+``\texttt{typedef\_stack}'' to denote current parsing status. When a parser has
+just read a type specifier, before it moves on, it invokes
+``\texttt{def\_enter(FORCE\_ID)}'' to notify ``\texttt{typedef\_stack}'' the
+subsequent string token must be an identifier instead of a type specifier. As
+for ``\texttt{typedef}'' statements, the parser will invoke
+``\texttt{def\_enter(IN\_TYPEDEF)}'' to record the newly defined typename by
+adding an entry to a hash table. In the lexer, when a string token is being
+read, it invokes ``\texttt{is\_identifier}'' to make sure whether the string is
+an \texttt{IDENTIFIER} or a \texttt{USER\_TYPE}. 
+
+Figure demonstrates a very subtle use of \texttt{typedef}, which can be parsed perfectly by CIBIC.
+
+\begin{listing}[H]
+    \centering
+    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
+    \begin{minted}{c}
+/* It's quite common to define a shorthand `I' for `struct I'. */
+typedef struct I I; 
+/* Declaration of a function with parameter `a' of user-defined type `I'. */
+int incomp(I a);
+/* The definition of `I' is postponed. */
+struct I { 
+    int i, j;
+};
+/* Function definition of previous declared one. */
+int incomp(I a) {}
+/* Define `b' as an int type. */
+typedef int b;
+int main() {
+    /* Variable `b' is of type `b', an integer actually. */
+    I i;
+    b b;
+    incomp(i);
+}
+\end{minted}
+\caption {Code Snippet to Track Down Location}
+\end{listing}
+
+
 \subsection{Complex Declaration and Function Pointer}
 \section{Intermadiate Representation}
 \subsection{Single Static Assignment Form}
-- 
cgit v1.2.3