doc/cibic.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925

\documentclass[10pt, a4paper]{article}

\usepackage{xeCJK, fontspec}
\usepackage{amsmath, verbatim}
\usepackage{setspace, float}
\usepackage{graphicx, wrapfig, algorithmicx}
\usepackage{rotating, enumerate, minted, fancyvrb, longtable}
\usepackage[left=1in, right=1in]{geometry}
\usepackage[font=small]{caption}

\defaultfontfeatures{Mapping=tex-text}
\setCJKmainfont[BoldFont={STHeiti}]{Adobe Song Std}
%\XeTeXlinebreaklocale "zh"
%\XeTeXlinebreakskip = 0pt plus 1pt minus 0.1pt

%\setlength{\parindent}{2em}
%\setlength{\parskip}{1em}
\makeatletter
\def\verbatim@font{\ttfamily\small}
\makeatother

%\makeatletter
%\let\@afterindentrestore\@afterindentfalse
%\let\@afterindentfalse\@afterindenttrue
%\@afterindenttrue
%\makeatother

\xeCJKsetup{CJKglue=\hspace{0pt plus .08 \baselineskip }}

\newcommand{\ud}{\mathrm{d}}
\usemintedstyle{colorful}
\begin{document}
\title{\Large{CIBIC: C Implemented Bare and Ingenuous Compiler}}

\author{\textsc{尹茂帆 Ted Yin}\\ \\\multicolumn{1}{p{.7\textwidth}}
    {\centering\emph{F1224004 5120309051\\
Shanghai Jiao Tong University ACM Class}}}
\maketitle
\begin{abstract}
    This report presents the design and features of a simple C compiler which
    generates MIPS assembly code. Although not all the language requirements and
    features are implemented according to the standard, it still supports major
    C features, such as basic types (void, int, char), basic flow control syntax
    (if, while-loop, for-loop), user-defined types (aka. typedef), functions,
    pointers (including function pointers), struct/union (may be nested), etc.
    Besides, it makes use of SSA (Single Static Assignment) form for the IR
    (Intermediate Representation) and optimization. The whole compiler is
    written in pure C, obeying C89/C99 standard. The report first introduces the
    lexer and parser generation, then focuses on the data structures being used
    in the AST (Abstract Syntax Tree) and symbol tables, and makes a conclusion
    on the supported syntactical features. Next, the report shows the
    intermediate representation design and claims the techniques that have been
    used in the project. Finally, various optimization techniques are presented
    and discussed, some are accompanied with code snippets.
\end{abstract}
\tableofcontents
\section{Lexer and Parser}
CIBIC makes good use of existing lexer and parser generators. It uses Flex to
generate lexer while using Bison as parser generator. Both generators read
boilerplate text which contains C code fragments to be filled in the generated
lexer/parser. The best practice, I suggest, is to avoid embedding too much
concrete code in the boilerplate. Instead, we shall write well-designed
functions in a separate file (``\texttt{ast.c}'' in CIBIC) and invoke functions
in the boilerplate. The reason is that it is almost not practical nor
convenient to debug the generated lexer or parser. Using the seperation method,
we can set up breakpoints in the seperate file easily and debug on the fly. The
declarations of all functions that may be invoked during parsing can be found
in``\texttt{ast.h}''.

It requires a little more effort to track the location of each token using
Flex.  Luckily, Flex provides with a macro called ``\texttt{YY\_USER\_ACTION}''
to let the user do some extra actions after each token is read. So I maintained
a global variable ``\texttt{yycolumn}'' to keep the current column, a global
char array ``\texttt{linebuff}'' to buffer the text being read for error
report.
\begin{listing}[H]
    \centering
    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}{c}
int yycolumn = 1;
char linebuff[MAX_LINEBUFF], *lptr = linebuff;

#define YY_USER_ACTION \
    do { \
        yylloc.first_line = yylloc.last_line = yylineno; \
        yylloc.first_column = yycolumn; \
        yylloc.last_column = yycolumn + yyleng - 1; \
        yycolumn += yyleng; \
        memmove(lptr, yytext, yyleng);  \
        lptr += yyleng; \
    } while (0);

#define NEW_LINE_USER_ACTION \
    do { \
        yycolumn = 1; \
        lptr = linebuff; \
    } while (0)
\end{minted}
\caption {Code Snippet to Track Down Location}
\end{listing}
\subsection{Friendly Error Report}
CIBIC generates friendly and accurate error report. The corresponding line and
column number are printed. Besides, if it is an syntax error, CIBIC will print
the context where the error occurs just like clang.
\begin{figure}[H]
    \centering
\begin{BVerbatim}
2:28: error: syntax error, unexpected ';', expecting ',' or ')'
    int this_is_an_example(;
                           ^

2:13: error: syntax error, unexpected identifier
    typedef a
            ^
\end{BVerbatim}
\caption {Some Error Report Examples}
\end{figure}

\section{Semantic Analysis and Implementation}
The parsing process is also an AST (Abstrct Syntax Tree) construction process.
By calling the corresponding functions, the generated parser creates tree nodes
and merge them into subtrees. The major issue here is how to design a proper
data structure to represent and maintain the AST. In CIBIC, all nodes in an AST
are instances of struct ``CNode'' in figure \ref{fig:struct_cnode}.
\begin{figure}[H]
    \centering
    \texttt {
    \begin{tabular}{|c|}
        \hline
        type \\
        \footnotesize{describe syntax information e.g. expression or declarator} \\ \hline
        rec: \\
        \footnotesize{\textbf{union of intval / subtype / strval}} \\
        \footnotesize{stores detailed information e.g. which kind of expression it is} \\ \hline
        ext: \\
        \footnotesize{\textbf{struct of type, var, autos, const\_val, is\_const, offset}} \\
        \footnotesize{extended info, where annotation is made} \\ \hline
        chd \\
        \footnotesize{the left most child of the node} \\ \hline
        next \\
        \footnotesize{the next sibling} \\ \hline
        loc \\
        \footnotesize{\textbf{struct of row, col}} \\
        \footnotesize{the location in the source code, for error report} \\ \hline
        \end{tabular}
    }
    \caption{The Structure of a CNode instance}
    \label{fig:struct_cnode}
\end{figure}
Since a node may have variable number of children, the most common and
efficient approach is to implement a left-child right-sibling binary tree. The
field ``\texttt{chd}'' points to the child and ``\texttt{next}'' points to the
next sibling. This implementation is extremely flexible because we do not need
to know the number of children in advance, which brings us the convenience of
appending argument nodes to a function invocation node in the boilerplate.

After construction of the AST, the tree will be traversed by calling mutually
recursive functions declared in ``\texttt{semantics.h}''. The entry call is
``\texttt{semantics\_check}'' which will set up the symbol tables and call
``\texttt{semantics\_func}'' and ``\texttt{semantics\_decl}'' to analyze
function definitions and global declarations. These functions will further
invoke more basic functions such as ``\texttt{semantics\_comp}'' (handles
compound statements) to fully check the semantics and gather variable typing
information.

The key data structures in the semantic checking are \textbf{symbol tables}.
Symbol tables maintain variables and types in different scopes across different
name spaces. When a variable is defined, the corresponding type specifer will
be checked and binds to the variable. Also, when the scope ends, all variable
bindings living within the scope will be removed from the table. Although they
are removed from the table, the bindings are still referenced by some nodes on
the AST, so that the AST becomes ``\textbf{annotated AST}''. In CIBIC, the
variable reference is stored in ``\texttt{ext.var}'' field of
``\texttt{CNode}'' and the type information of an subexpression is annotated at
``\texttt{ext.type}''. Thus, in a word, symbol tables stores all symbols that
are currently \textbf{visible}.

In C language, there are four name spaces: for label names, tags, members of
structures or unions and all other identifiers. Goto statments are not
implemented in CIBIC, so there're actually three name spaces. Since each kind of
structures or unios have its own name space for its fields, we treat them
specially and create a new table for each of them. For tags and all other
identifiers, we set up two global tables. Besides name spaces, scoping is also a
very important concept in C. It seems to be an orthogonal concept to name
spaces.

Considering these two concepts, CIBIC implements a struct named
``\texttt{CScope}'' to maintain all the information as shown in figure
\ref{fig:struct_cscope}.
\begin{figure}[H]
    \centering
    \texttt {
    \begin{tabular}{|c|}
        \hline
        lvl \\
        \footnotesize{current nesting level of scopes, e.g. 0 for global scope} \\ \hline
        func \\
        \footnotesize{current function, helpful when analyzing a return statement} \\ \hline
        inside\_loop \\
        \footnotesize{if the scope is inside a loop, help full when analyzing a break statement} \\ \hline
        top \\
        \footnotesize{points to the top of the scope stack} \\ \hline
        ids \\
        \footnotesize{name space for all other variables} \\ \hline
        tags \\
        \footnotesize{name space for all tags (name of structs or unions)} \\ \hline
        \end{tabular}
    }
    \caption{The Structure of a CScope instance}
    \label{fig:struct_cscope}
\end{figure}

Note that ``\texttt{top}'' points to an instance of ``\texttt{CSNode}'' which
has two fields ``\texttt{symlist}'' and ``\texttt{next}''. ``\texttt{symlist}''
points to a list of symbols in the same scope while ``\texttt{next}'' links to
the outer scope which is another instance of ``\texttt{CSNode}''. As for
``\texttt{ids}'' and ``\texttt{tags}'', they are pointers to
``\texttt{CTable}'',stores all the current symbols. As mentioned above, for
each struct or union, there is also a pointer to ``\texttt{CTable}'' stores all
field names. ``\texttt{CTable}'' is an open addressing hash table
containing nodes of the type ``\texttt{CTNode}''. Inspired by \cite{grune12},
the structure of each node is designed as in figure \ref{fig:struct_ctnode}.

\begin{figure}[H]
    \centering
    \texttt {
    \begin{tabular}{|c|}
        \hline
        key \\
        \footnotesize{char *} \\ \hline
        val \\
        \footnotesize{void *, in order to also be used in checking duplicate parameters, etc.} \\ \hline
        next \\
        \footnotesize{the next element which has the same hash value} \\ \hline
        lvl \\
        \footnotesize{scope level} \\ \hline
        \end{tabular}
    }
    \caption{The Structure of a CTNode instance}
    \label{fig:struct_ctnode}
\end{figure}

Thanks to the level information kept in each ``\texttt{CTNode}'', we do not
have to establish a hash table for every scopes, which may be memory consuming.
Instead, whenever a new scope begins, CIBIC simply pushes a new frame to scope
stack. This is achieved by creating an instance of ``\texttt{CSNode}'', setting
its ``\texttt{next}'' field to the ``\texttt{top}'' field of the
``\texttt{CScope}'' then letting the ``\texttt{top}'' points to the new frame,
finally increasing ``\texttt{lvl}'' by one. Whenever a new symbol is being
added to ``\texttt{CScope}'', CIBIC adds the symbol to one of the tables
``\texttt{ids}'' and ``\texttt{tags}'', then also appends the symbol to the
``\texttt{symlist}'' of the top of the scope stack. The most elegant
characteristics of open addressing hash tables is, for the same name appears in
different scopes, the symbol defined at the inner most is always ahead of
others in the chain of the table because it is the most recently added. So,
for lookups, CIBIC just need to return the first node having the same name in
the chain, which is very time-efficient. At last, when a scope ends, CIBIC
scans the whole ``\texttt{symlist}'' of the top scope frame, and tries to remove
these symbols from the table. Figure \ref{fig:nested_scope} presents the content
of the scope stack when the analysis proceeds into the inner most declaration of
a. Chains with hash code 0097 and 0098 in figure \ref{fig:scope_stack} reveal
the underlying mechanism.
\begin{figure}[H]
%    \centering
    \begin{minipage}{0.35\textwidth}
%    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}{c}
int main() {
    int a, b;
    if (a > 0)
    {
        int a, b;
        if (a > 0)
        {
            int a, b;
        }
    }
}
    \end{minted}
    \caption {Nested Scope Example}
    \label{fig:nested_scope}
\end{minipage}
%    \centering
    \begin{minipage}{0.5\textwidth}
    \begin{BVerbatim}
    [0072]->[int@747e780:0]
    [0097]->[a@7484ae0:3]->[a@7484890:2]->[a@7484580:1]
    [0098]->[b@7484bb0:3]->[b@7484960:2]->[b@7484710:1]
    [0108]->[printf@747f150:0]
    [0188]->[scanf@747f640:0]
    [0263]->[__print_string@7484010:0]
    [0278]->[__print_char@7483fc0:0]
    [0623]->[__print_int@747f8b0:0]
    [0778]->[malloc@7484100:0]
    [0827]->[void@747eee0:0]
    [0856]->[main@7484530:0]
    [0908]->[memcpy@7484060:0]
    [0971]->[char@747e9f0:0]
    \end{BVerbatim}
    \caption {CIBIC Dump: Scope Stack}
    \label{fig:scope_stack}
\end{minipage}
\end{figure}

\subsection{Type System}
C has a small set of basic types. In CIBIC, basic types include \texttt{char},
\texttt{int} and \texttt{void} (literally, \texttt{void} is not an actual
type). However, C supports two powerful type aggregation: arrays and structures
(unions), and also supports an indirect access tool: pointer. This makes the
type system much more complex and usable in practice. CIBIC uses the concept
``type tree'' to organize types. All basic types are the leaves of a type tree,
while aggregate types and pointers are the intermediate nodes. Figure
\ref{fig:type_tree} shows a typical type tree of a C struct.

\begin{figure}
    \centering
    \includegraphics[scale=0.5]{tree.png}
    \caption{A Typical Type Tree}
    \label{fig:type_tree}
\end{figure}

A type tree is good at preserving type hierarchy info which may be extremely
useful in type checking. For example, when checking a expression \texttt{*a},
compiler first check if the root of the type tree of a is a pointer. If not,
error message will be printed and semantic checking fails. Otherwise, the type
of the expression result is the subtree of the only child of the root. Also, a
type tree enables us to implement complex type nesting, which will be discussed
later on.
\subsection{\texttt{typedef} support}
CIBIC has support for user-defined types, which are defined via the keyword
``\texttt{typedef}''.  However, ``\texttt{typedef}'' is notoriously difficult
to deal with due to the ambiguity caused by the language design. For example,
in ``\texttt{int A}'', A is a \textbf{variable} of integer type, but in
``\texttt{A a}'', A is a user-defined \textbf{type}. The subtle semantic
difference is caused by context. In former case, A is identified as a
identifier token, while in latter case, identified as a type specifier. The
meaning of a token should be made clear during lexical analysis which does not
take in account of context. So the most direct and effective way to implement
\texttt{typedef} is to hack the lexer and parser. CIBIC maintains a
``\texttt{typedef\_stack}'' to denote current parsing status. When a parser has
just read a type specifier, before it moves on, it invokes
``\texttt{def\_enter(FORCE\_ID)}'' to notify ``\texttt{typedef\_stack}'' the
subsequent string token must be an identifier instead of a type specifier. As
for ``\texttt{typedef}'' statements, the parser will invoke
``\texttt{def\_enter(IN\_TYPEDEF)}'' to record the newly defined typename by
adding an entry to a hash table. As for the lexer, when a string token is being
read, it invokes ``\texttt{is\_identifier}'' to make sure whether the string is
an \texttt{IDENTIFIER} or a \texttt{USER\_TYPE}.

Listing \ref{list:typedef} demonstrates a very subtle use of \texttt{typedef},
which can be parsed perfectly by CIBIC.

\begin{listing}[H]
    \centering
    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}{c}
/* It's quite common to define a shorthand `I' for `struct I'. */
typedef struct I I;
/* Declaration of a function with parameter `a' of user-defined type `I'. */
int incomp(I a);
/* The definition of `I' is postponed. */
struct I {
    int i, j;
};
/* Function definition of previous declared one. */
int incomp(I a) {}
/* Define `b' as an int type. */
typedef int b;
int main() {
    /* Variable `b' is of type `b', an integer actually. */
    I i;
    b b;
    incomp(i);
}
\end{minted}
\caption {\texttt{typedef} Example}
\label {list:typedef}
\end{listing}
\newpage
\subsection{Complex Declaration and Function Pointer}
With the help of type tree, CIBIC supports complex type declaration and
function pointers. The code below shows declaration with different semantics.
The subsequent dump information shows the corresponding type trees. The
hexidecimal value after ``\texttt{@}'' is the memory address of corresponding
instance in the implementation. For example, local variable \texttt{a} is an
instance of struct whose address is \texttt{735da40}, and \texttt{next} field
of \texttt{struct A} also points to the type with the same address. In fact,
the address of a struct type instance is regarded as its unique id in CIBIC.
This guarantees that the type tree is correctly structured as shown in figure
\ref{fig:type_tree}. It is also worth noting that three different delcarations
of ``arrays'' have inherently different meaning. Also the code shows us a very
complex declaration of a function \texttt{func}. It is a selector which returns
one of the pointers to a function in its parameters.

\begin{listing}[H]
    \centering
    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}[linenos=true,firstnumber=1]{c}
struct A {
    int x, y;
    struct B {
        int i, j;
    } b;
    struct A *next;
};

/* a function that returns a pointer to function */
int (*func(int flag, int (*f)(), int (*g)()))() {
    if (flag) return f;
    else return g;
}

int main() {
    struct A a;
    /* the follow types are distinctly different */
    int a0[10][20]; /* two-dimensional array */
    int (*a1)[20];  /* a pointer to array */
    int *a2[20];    /* an array of pointers */
    int **a3;       /* pointer to pointer */
    /* pointer to a function */
    int (*f)(), (*h)();
    /* function declaration, not a variable */
    int (*g(int ***e[10]))();
    /* complex type casting is also supported */
    f = (int (*)())(0x12345678);
    f = func(1, f, main); /* f */
    h = func(0, f, main); /* main */
    /* 0 1 */
    printf("%d %d\n", f == main, h == main);
}
    \end{minted}
 \caption {Complex Declaration Example}
\end{listing}
\begin{figure}[H]
    \centering
    \begin{BVerbatim}
[func:{name:func}{size:-1}
    {params:
        [var@735fd60:flag :: [int]],
        [var@735ff00:f :: [ptr]->
                [func:{name:}{size:-1}
                    {params:}
                    {local:}]->[int]],
        [var@7360080:g :: [ptr]->
                [func:{name:}{size:-1}
                    {params:}
                    {local:}]->[int]]}
    {local:}]->[ptr]->
        [func:{name:}{size:-1}
            {params:}
            {local:}]->[int]

[func:{name:main}{size:-1}
    {params:}
    {local:
        [var@7360d70:h :: [ptr]->
                [func:{name:}{size:-1}
                    {params:}
                    {local:}]->[int]],
        [var@7360c00:f :: [ptr]->
                [func:{name:}{size:-1}
                    {params:}
                    {local:}]->[int]],
        [var@7360a00:a3 :: [ptr]->[ptr]->[int]],
        [var@7360820:a2 :: [arr:{len:20}{size:80}]->[ptr]->[int]],
        [var@7360640:a1 :: [ptr]->[arr:{len:20}{size:-1}]->[int]],
        [var@7360460:a0 :: [arr:{len:10}{size:800}]->[arr:{len:20}{size:80}]->[int]],
        [var@7360110:a :: 
            [struct@735da40:{name:A}{size:20}{fields:
                [var@735d9b0:b :: 
                    [struct@735b730:{name:B}{size:8}{fields:
                        [var@735d7d0:i :: [int]],
                        [var@735d8b0:j :: [int]]}]],
                [var@735b5c0:x :: [int]],
                [var@735b6a0:y :: [int]],
                [var@735db50:next :: [ptr]->
                        [struct@735da40:{name:A}]]}]]}]->[int]

    \end{BVerbatim}
    \caption {CIBIC Dump: Complex Declaration}
\end{figure}

Accompanied by complex type declaration, complex type casting is also allowed in
CIBIC. In the code above, an integer \texttt{0x12345678} is casted into a
pointer to a function with empty parameter list returning an integer, and assign
to function pointer f. Note that in order to deal with the form like
``\texttt{(int (*)())}'', syntax description in ``\texttt{cibic.y}'' is rewritten
according to the standard and made more general.

Function pointers are easy to implement in MIPS assembly. But their declarations
could be more complex and esoteric than we expect it to be. Although these
constructions are rarely used, the compilers that support function pointer are
supposted to understand them. For example, ``\texttt{int (*g(int
***e[10]))();}'' declares a function g which takes an array of pointers as
parameters and returns a function pointer. The code below demonstrates a
non-typical use of function pointer. It can be compiled correctly by CIBIC. When
x is non-zero, the program prints ``i'm f'' five times, otherwise, prints ``i'm
g''. Note that we make use of the language feature of C that the empty parameter
list means uncertain number of parameters, so that f and g can pass func itself
to the invocation of func.
\begin{listing}[H]
    \centering
    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}{c}
typedef void (*Func_t)();
void f(Func_t func, int step) {
    if (!step) return;
    printf("i'm f\n");
    func(func, step - 1);
}
/* void (*func)() has the same meaning as Func_t func */
void g(void (*func)(), int step) {
    if (!step) return;
    printf("i'm g\n");
    func(func, step - 1);
}
int main() {
    void (*func)(void (*ifunc)(), int step);
    int x = 1;
    if (x) func = f;
    else func = g;
    func(func, 5);
    return 0;
}
    \end{minted}
    \caption {Self-reference function pointer}
\end{listing}
\section{Intermediate Representation}
A good IR (intermediate representation) should be both easy to modify (or
optimize) and convenient to be transformed into assembly. A typical bad design
is to make up an IR that totally resembles assembly instructions. This does not
make much sense because when IR almost looks like assembly, we actually do not
need IR at all, even though this makes it easier to translate. Moreover, if IR
to assembly is a ``one-to-one correspondence'', we can not benefit much from
the IR in the optimization, even suffer from debugging since one semantically
clear operation may be splitted into several confusing assembly-like IR
instructions.

In CIBIC, there are mainly three kinds of IR instructions: flow control,
assignment and calculation. For example, \texttt{BEQ}, \texttt{BNE},
\texttt{GOTO}, \texttt{RET}, and \texttt{CALL} are flow control instructions;
\texttt{ARR}, \texttt{WARR}, \texttt{MOVE} are assignment instructions;
\texttt{MUL}, \texttt{DIV}, \texttt{ADD}, \texttt{SUB}, etc. are calculation
instructions. There are also a few special types of instructions, like
\texttt{PUSH}, \texttt{LOAD}. \texttt{PUSH} indicates an argument is pushed to
the stack. \texttt{LOAD} is just a ``pseudo-instruction'', it is only designed
for helping the unification of SSA form because every variable needs to be
defined somewhere. So local variables are parameters are first ``loaded'' at the
beginning of a function. Therefore for some variables \texttt{LOAD} does not
need to be translated into a de facto load instruction (for example, spilled
variables). All kinds of instructions used in IR is shown in table.
\begin{center}
    \texttt {
    \begin{longtable}{|r|c|c|c|l|}
            OpCode  &  Dest.    &  Src. 1  &  Src. 2  &   Explanation \\  \hline
            BEQ     &  block    &  cond    &  val     &   if (cond == val) goto block \\
            BNE     &  block    &  cond    &  val     &   if (cond != val) goto block \\
            GOTO    &  block    &  NULL    &  NULL    &   goto block \\
            CALL    &  ret      &  func    &  NULL    &   ret = call func \\
            RET     &  NULL     &  ret     &  NULL    &   return ret \\  \hline
            PUSH    &  NULL     &  arg     &  NULL    &   push arg \\
            LOAD    &  var      &  NULL    &  NULL    &   load var \\  \hline
            ARR     &  var      &  arr     &  index   &   var = arr[index] \\
            WARR    &  arr      &  var     &  index   &   arr[index] = var \\
            MOVE    &  var1     &  var2    &  NULL    &   var1 = var2 \\  \hline
            ADDR    &  var1     &  var2    &  NULL    &   var1 = addr var2 \\
            MUL     &  res      &  var1    &  var2    &   res = var1 * var2 \\
            DIV     &  res      &  var1    &  var2    &   res = var1 / var2 \\
            MOD     &  res      &  var1    &  var2    &   res = var1 \% var2 \\
            ADD     &  res      &  var1    &  var2    &   res = var1 + var2 \\
            SUB     &  res      &  var1    &  var2    &   res = var1 - var2 \\
            SHL     &  res      &  var1    &  var2    &   res = var1 << var2 \\
            SHR     &  res      &  var1    &  var2    &   res = var1 >> var2 \\
            AND     &  res      &  var1    &  var2    &   res = var1 \& var2 \\
            XOR     &  res      &  var1    &  var2    &   res = var1 \^{} var2 \\
             OR     &  res      &  var1    &  var2    &   res = var1 | var2 \\
            NOR     &  res      &  var1    &  var2    &   res = var1 nor var2 \\
             EQ     &  res      &  var1    &  var2    &   res = var1 == var2 \\
             NE     &  res      &  var1    &  var2    &   res = var1 != var2 \\
             LT     &  res      &  var1    &  var2    &   res = var1 < var2 \\
             GT     &  res      &  var1    &  var2    &   res = var1 > var2 \\
             LE     &  res      &  var1    &  var2    &   res = var1 <= var2 \\
             GE     &  res      &  var1    &  var2    &   res = var1 >= var2 \\
            NEG     &  res      &  var1    &  NULL    &   res = -var1 \\
        \end{longtable}
    }
\end{center}

Here are some remarks:
\begin{enumerate}
    \item Because C standard requires shortcuit of ``\texttt{\&\&}'' and
        ``\texttt{||}'' operator, they are transformed into branches, will not
        be shown in the IR.

    \item Multi-dimensional arrays require a little more care. In C, all arrays
        are treated as one-dimensional array in the end. For instance,
        \texttt{a[2][3][4]} (\texttt{a} is declared as \texttt{int a[5][5][5]})
        is transformed into \texttt{*(a + 2 * 100 + 3 * 20 + 4 * 4)}, so its IR,
        for example could be:
\begin{center}
    \texttt{
        \begin{tabular}{l}
            t5 = a\_0 + 200 \\
            t3 = t5 + 60 \\
            b\_1 = t3[16] \\
        \end{tabular}
    }
\end{center}
\item Pointer dereference such as \texttt{a = *ptr} can be easily represented by
    \texttt{a = ptr[0]}.
\item Since structs and unions can not be entirely stored in a register, CIBIC
    regard a instance of a struct or union as a pointer to it. Since the memory
    offset of an attribute can be determined after semantics analysis, access to
    a struct (or union) can be represented in the same way as array:
    \begin{center}
        \begin{minipage}{0.3\textwidth}
            \begin{minted}{c}
struct A {
    struct B {
        int i, j;
    } b;
    int x, y;
} a, a2;
int main() {
    struct B b;
    a.b.i = 1;
    a.b.j = 2;
    a.x = 3;
    a.y = 4;
    a2.b = b;
    a.b = a2.b;
}
            \end{minted}
        \end{minipage}
        \begin{minipage}{0.3\textwidth}
            \begin{BVerbatim}
t1 = a_0 + 8
t1[4] = 1
t4 = t1
t4[0] = 2
a_0[4] = 3
a_0[0] = 4
a2_0[8] = b_0
t12 = a2_0 + 8
a_0[8] = t12
        \end{BVerbatim}
        \end{minipage}
    \end{center}
    The only problem left is the ambiguity. If we regard a instance of a struct
    as a pointer to it, how could we distinguish a struct copy assignment from
    a struct pointer assignment? The answer is: this is not an ambiguity at
    all, since all the type information of operands are preserved, we can
    easily tell the difference by looking at type information. So actually, the
    printed IR above does not contain all the information, it is just a human
    readable form to easy our debug. The underlying type information is
    preserved in IR data structure and passed to the translator so can be used
    to guide the final translation. Of course, the code above does produce
    correct result compiled by CIBIC.
\end{enumerate}

\begin{listing}[H]
    \centering
    \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \begin{minted}{c}
calc_dominance_frontier();
/* build SSA */
insert_phi(vars);
renaming_vars(oprs);
/* optimization on SSA */
const_propagation();
subexp_elimination();
const_propagation();
strength_reduction();
deadcode_elimination();
/* out of SSA */
mark_insts();
build_intervals();
register_alloc();
    \end{minted}
    \caption{Workflow of IR in CIBIC}
    \label{list:workflow}
\end{listing}

\subsection{Single Static Assignment Form}
CIBIC makes good use of SSA (Single Static Assignment) form. SSA form is a
property of an IR, which says that each variable is assigned \textbf{exactly
once}. The property can simplify the liveness analysis and optimization, since
all variables are assigned only once so the ``define-use'' relationship is much
clearer that the original IR.

However, it is not trivial to convert an IR to SSA form, mainly because of the
``merging issue''. In figure \ref{fig:phi_function}, the control flow branches
at the \texttt{if} statement and merges again when getting out of if. The
question is, should we use \texttt{x\_1} or \texttt{x\_2} in the \texttt{y = x
+ 1} statement? The answer is, it is only known at runtime. So a
``phi-function'' \texttt{x\_3 = phi(x\_1, x\_2)} will be inserted at the merge
point to wrap two possibilities.  Unfortunately, not only does this
circumstance appear in if statement, but also exist in loops. How do we know
where we should add an phi-function for certain variable? The answer is we can
just insert phi-functions at dominance frontiers in the control flow graph.
\subsection{Phi-functions}
There are several ways of computing dominance frontiers of a control flow
graph.  But they all first construct the dominator tree and then deduce the
frontiers.  Well-known algorithms for finding out the dominator tree are the
straightforward iterative algorithm and Lengauer-Tarjan algorithm. The improved
version on latter algorithm provides with a nearly linear time complexity $O(m
\alpha(m, n))$. However, according to the research \cite{tarjan06} done by L.
Georgiagids, R.  F. Werneck, R. E. Tarjan et al, practical performance of
iterative algorithm is acceptable and even better than sophisticated LT
algorithm. CIBIC adopts a variant \cite{cooper01} of the original iterative
algorithm. It is faster than LT algorithm on real programs and easy to
implement.
\begin{figure}
    \centering
    \begin{minipage}{0.4\textwidth}
        \includegraphics[scale=0.5]{phi1.png}
    \end{minipage}
    \begin{minipage}{0.4\textwidth}
        \includegraphics[scale=0.5]{phi2.png}
    \end{minipage}
    \caption{Before and After Inserting Phi-function}
    \label{fig:phi_function}
\end{figure}
\subsection{Register Allocation}
CIBIC uses linear scan register allocation \cite{moe02} to allocate registers
\emph{before translating out of SSA form}. This method is different from
traditional one and can make use of lifetime holes and instruction weights to
improve the quality of the allocation.

To run the allocation algorithm, we shall first compute live intervals.
Unfortunately, the pseudo-code provided in the paper \cite{moe02} does not take
the loop cases into account. In another paper \cite{wimmer10}, I found the
correct algorithm for coping with loops. Unfortunately, the pseudo-code in two
papers are written in different forms and have different concepts in some
details. So I learned from both of them and got my own algorithm (almost like
the one in paper \cite{moe02}). Since the graph is traversed \textbf{only
once}, the implementation is quite efficient (the traditional method for
non-SSA IR requires iteration thus the time complexity is not guaranteed).

In linear scan algorithm described in paper \cite{moe02}, we should maintain
four sets: unhandled, handled, active, inactive. However, for the
implementation, we do not need to maintain handled set because once a variable
is handled, we can just evict it from the current active or inactive set and
there is no need to put it again into another handled set. Besides, the
unhandled set can be implemented as a sorted array, because intervals are
picked in increasing order of their start points. Therefore, in CIBIC, only
active and inactive sets are maintained by double-linked lists with sentinel
nodes. The double-linked design makes it easy to remove an interval from the
set and the sentinel node helps us to move to another set conveniently. The
algorithm also requires a pre-calculated weight of each interval (computed from
the accesses to the interval) which helps to decide the spilled variable.
\section{Performance and Optimization}
\subsection{Compilation Speed}
CIBIC compiles very fast for all the given source code and some additional test
cases. It costs CIBIC 0.10 second to compile all 52 programs, while gcc spends
1.90 seconds to complete in the same testing environment. Although the test may
not be complete or fair (since gcc is a much more sophisticated compiler), the
result does show a decent and promising compilation speed.
\subsection{Compilation Quality}
In the final examination, the performance of MIPS assembly generated by CIBIC
surpasses the ones generated by most of my classmates' Java-implemented
compilers. It is worth mentioning that all of those few compilers that produces
better code are written in Java and none of them always generates better code
than CIBIC. Also, in the circumstances where the code generated by CIBIC is not
the best, the runtime difference between the best performance code and the one
by CIBIC is small. Moreover, as far as I know, those compiler make use of (not
fully implemented) inline optimization which may be very effective for some
special code. However, CIBIC does not implement any form of inline optimization
due to the limited time (To implement a decent inline optimization for an
SSA-based and C-implemented compiler requires some extra work). Finally, I
would like to argue that all test cases provided by teaching assistants may not
be complete or convincing. The major design flaw is all functions are very
short and only have one or two nested \texttt{if} statement or loops (except
test case: ``superloop''). Therefore, the advantage of SSA (especially the
advanced register allocation algorithm) is not shown very well.

\begin{figure}[H]
    \centering
    \includegraphics[scale=0.5]{all.png}
\end{figure}
\begin{figure}[H]
    \centering
    \includegraphics[scale=0.5]{all2.png}
\end{figure}
\begin{figure}[H]
    \centering
    \includegraphics[scale=0.5]{all3.png}
\end{figure}

The above figures show the ``normalized instruction'' for each compiler
implementation with respect to different test cases. The ``normalized
instruction'' is obtained by dividing the number of instructions by the best
for the same test case, thus the lower, the better. There are 40 test cases
taken into account. Some good implementations are picked out and bolded in the
figures, while others are simply colored gray.

Although CIBIC does not generate the best code in some test cases, the overall
code quality is very good. Moreover, CIBIC has not implemented inline
optimization yet. As I know, compilers that are presented in bolded lines in
the second and the third figure all implement inline optimization, and before
the optimization, their performance are not as good as CIBIC. Therefore CIBIC
could do much more better.

\subsection{Optimization}
CIBIC implements the following optimizations: 
\begin{itemize}
    \item constant propagation
    \item dead code elimination
    \item strength reduction
    \item common subexpression elimination
\end{itemize}

Unfortunately, these optimization do not seem to be very effective to the
given test cases (although some may be effective to some specific test cases).
However, the assembly level optimization does have a considerable effect. Some
important techniques are:
\begin{itemize}
    \item reduce the use of \texttt{seq} and \texttt{sne} by fusing them with
        subsequent \texttt{beq} or \texttt{bne}
    \item reduce the load of immediates
    \item fuse the short-circuit operators and subsequent branch together (on
        the one hand it can reduce one or two extra jumps, on the other hand,
        the dominance of CFG may change so that common subexpression
        optimization can optimize more)
\end{itemize}

We are not going to discuss optimization effectiveness in detail because all
optimizations used by CIBIC are traditional. Instead, code snippets are
presented and explained below to reveal \textbf{inappropriate naive
implementation of some optimizations may be wrong}.

Constant propagation could be safely applied if there were not any indirect
modification to the propagated variable. Unfortunately, this is not always true.
A typical indirection modification is via calling a function which modifies a
global variable. Because of the function call, the modification is hidden from
the compiler. Another common indirect modification is done via a pointer. A
direct approach to get over these problem is to prevent the propagation over a
function call and to mark those variable whose address is being referenced (by
``\texttt{\&}'' operator) to disable the propagation of their value.

The same things may also happen in common subexpression elimination. The code
below demonstrates two typical situations in which the aggressive elimination
may produce a wrong result.
\begin{figure}[H]
    \begin{minipage}{0.4\textwidth}
        \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
    \centering
        \begin{minted}{c}
int N = 0;
void f() { N = 2; }
int main() {
    int a, b;
    a = N + 1;
    f();
    b = N + 1;
    printf("%d\n", b); /* 3 */
}
        \end{minted}
    \end{minipage}
    %    \centering
    \begin{minipage}{0.5\textwidth}
        \RecustomVerbatimEnvironment{Verbatim}{BVerbatim}{}
        \centering
\begin{minted}{c}
    int flag = 0;
    int check(int x, int y) {
        return x > 0 && y > 0 && (flag ^= 1);
    }
    int main() {
        int x = 1, y = 2;
        printf("%d\n", check(x, y)); /* 1 */
        printf("%d\n", check(x, y)); /* 0 */
        printf("%d\n", check(x, y)); /* 1 */
    }
\end{minted}
    \end{minipage}
    \label{fig:cse}
\end{figure}
Global common subexpression elimination should be performed while walking down
the dominator tree \cite{sassa07}. Luckily, The dominator tree can be easily
obtained because it is an important by-product of SSA construction. 

In CIBIC, all optimizations need to be performed only once or twice as shown in
listing \ref{list:workflow}. Because the implemented optimizations do not
require iteration.

\section{Conclusion}
CIBIC is a simple but efficient C compiler implementation in C. It supports key
characteristics of C language such as structs/unions, arrays, and various type
of pointers. \texttt{typedef}, a very useful syntactic sugar is also
implemented in CIBIC. Furthermore, CIBIC makes use of SSA to generate code of
decent quality. As a lightweight compiler, it compiles much faster than gcc
(gcc implements a lot more features though).

Some standard syntax and features written in C are not supported by CIBIC:
\texttt{enum}, \texttt{switch}, linking, preprocessing (such as
\texttt{\#define}), etc. However, it is very possible to extend CIBIC to
support these features. Some, for example \texttt{switch} statements are easy
to be added.
\begin{thebibliography}{9}
    \bibitem{grune12}
        Grune, Dick, et al. Modern Compiler Design. Springer, 2012.
    \bibitem{appel97}
        Appel, Andrew W. and Ginsburg, Maia. Modern Compiler Implementation in
        C. Cambridge University Press, 1997.
    \bibitem{tarjan06}
        Georgiadis, Loukas, Robert Endre Tarjan, and Renato Fonseca F. Werneck.
        ``Finding Dominators in Practice.'' J. Graph Algorithms Appl. 10.1
        (2006): 69-94.
    \bibitem{cooper01}
        Cooper, Keith D., Timothy J. Harvey, and Ken Kennedy. ``A simple, fast
        dominance algorithm.'' Software Practice \& Experience 4 (2001): 1-10.
    \bibitem{moe02}
        M\"ossenb\:ock, Hanspeter, and Michael Pfeiffer. ``Linear scan register
        allocation in the context of SSA form and register constraints.''
        Compiler Construction.  Springer Berlin Heidelberg, 2002.
    \bibitem{wimmer10}
        Wimmer, Christian, and Michael Franz. ``Linear scan register allocation
        on SSA form.'' Proceedings of the 8th annual IEEE/ACM international
        symposium on Code generation and optimization. ACM, 2010. 
    \bibitem{sassa07}
        Tokyo Institute of Technology. Optimization in Static Single Assignment
        Form
\end{thebibliography}
\end{document}