[pypy-commit] extradoc extradoc: more in-depth discussion of related work

cfbolz Tue, 14 Aug 2012 08:38:18 -0700

Author: Carl Friedrich Bolz <[email protected]>
Branch: extradoc
Changeset: r4570:d3fc033b4584
Date: 2012-08-14 17:37 +0200
http://bitbucket.org/pypy/extradoc/changeset/d3fc033b4584/


Log:    more in-depth discussion of related work

diff --git a/talk/dls2012/paper.bib b/talk/dls2012/paper.bib
--- a/talk/dls2012/paper.bib
+++ b/talk/dls2012/paper.bib
@@ -12,7 +12,7 @@
        year = {1984}
 },
 
-@inproceedings{carl_friedrich_bolz_towards_2010,
+@inproceedings{bolz_towards_2010,
        address = {Hagenberg, Austria},
        title = {Towards a Jitting {VM} for Prolog execution},
        isbn = {978-1-4503-0132-9},
@@ -21,7 +21,7 @@
        abstract = {Most Prolog implementations are implemented in low-level 
languages such as C and are based on a variation of the {WAM} instruction set, 
which enhances their performance but makes them hard to write. In addition, 
many of the more dynamic features of Prolog (like assert), despite their 
popularity, are not well supported. We present a high-level continuation-based 
Prolog interpreter based on the {PyPy} project. The {PyPy} project makes it 
possible to easily and efficiently implement dynamic languages. It provides 
tools that automatically generate a just-in-time compiler for a given 
interpreter of the target language, by using partial evaluation techniques. The 
resulting Prolog implementation is surprisingly efficient: it clearly 
outperforms existing interpreters of Prolog in high-level languages such as 
Java. Moreover, on some benchmarks, our system outperforms state-of-the-art 
{WAM-based} Prolog implementations. Our paper aims to show that declarative 
languages such as Pr
 olog can indeed benefit from having a just-in-time compiler and that {PyPy} 
can form the basis for implementing programming languages other than Python.},
        booktitle = {{PPDP}},
        publisher = {{ACM}},
-       author = {Carl Friedrich Bolz and Michael Leuschel and David Schneider},
+       author = {Bolz, Carl Friedrich and Leuschel, Michael and Schneider, 
David},
        year = {2010},
        keywords = {interpreters, jit, logic programming, partial evaluation}
 },
@@ -57,8 +57,25 @@
        keywords = {code generation, design, dynamically typed languages, 
experimentation, incremental compilers, languages, measurement, performance, 
run-time environments, trace-based compilation}
 },
 
+@inproceedings{kotzmann_escape_2005,
+       address = {New York, {NY}, {USA}},
+       series = {{VEE} '05},
+       title = {Escape analysis in the context of dynamic compilation and 
deoptimization},
+       isbn = {1-59593-047-7},
+       location = {Chicago, {IL}, {USA}},
+       doi = {10.1145/1064979.1064996},
+       abstract = {In object-oriented programming languages, an object is said 
to escape the method or thread in which it was created if it can also be 
accessed by other methods or threads. Knowing which objects do not escape 
allows a compiler to perform aggressive {optimizations.This} paper presents a 
new intraprocedural and interprocedural algorithm for escape analysis in the 
context of dynamic compilation where the compiler has to cope with dynamic 
class loading and deoptimization. It was implemented for Sun Microsystems' Java 
{HotSpot&#8482;} client compiler and operates on an intermediate representation 
in {SSA} form. We introduce equi-escape sets for the efficient propagation of 
escape information between related objects. The analysis is used for scalar 
replacement of fields and synchronization removal, as well as for stack 
allocation of objects and fixed-sized arrays. The results of the 
interprocedural analysis support the compiler in inlining decisions and allow 
actual parameters 
 to be allocated on the caller {stack.Under} certain circumstances, the Java 
{HotSpot&#8482;} {VM} is forced to stop executing a method's machine code and 
transfer control to the interpreter. This is called deoptimization. Since the 
interpreter does not know about the scalar replacement and synchronization 
removal performed by the compiler, the deoptimization framework was extended to 
reallocate and relock objects on demand.},
+       booktitle = {Proceedings of the 1st {ACM/USENIX} international 
conference on Virtual execution environments},
+       publisher = {{ACM}},
+       author = {Kotzmann, Thomas and M&#246;ssenb&#246;ck, Hanspeter},
+       year = {2005},
+       note = {{ACM} {ID:} 1064996},
+       keywords = {algorithms, allocation/deallocation strategies, 
deoptimization},
+       pages = {111&#8211;120}
+},
+
 @inproceedings{bolz_towards_2009,
-       title = {Towards {Just-In-Time} Partial Evaluation of Prolog},
+       title = {Towards Just-In-Time Partial Evaluation of Prolog},
        doi = {10.1007/978-3-642-12592-8_12},
        booktitle = {Logic Program Synthesis and Transformation},
        author = {Bolz, Carl Friedrich and Leuschel, Michael and Rigo, Armin},
@@ -82,7 +99,7 @@
        location = {Barcelona, Spain},
        url = {http://dx.doi.org/10.1109/MICRO.2005.22},
        doi = {http://dx.doi.org/10.1109/MICRO.2005.22},
-       abstract = {The performance of a dynamic optimization system depends 
heavily on the code it selects to optimize. Many current systems follow the 
design of {HP} Dynamo and select a single interprocedural path, or trace, as 
the unit of code optimization and code caching. Though this approach to region 
selection has worked well in practice, we show that it is possible to adapt 
this basic approach to produce regions with greater locality, less needless 
code duplication, and fewer profiling counters. In particular, we propose two 
new region-selection algorithms and evaluate them against Dynamo&#191;s 
selection mechanism, {Next-Executing} Tail {(NET).} Our first algorithm, 
{Last-Executed} Iteration {(LEI)}, identifies cyclic paths of execution better 
than {NET}, improving locality of execution while reducing the size of the code 
cache. Our second algorithm allows overlapping traces of similar execution 
frequency to be combined into a single large region. This second technique can 
be appl
 ied to both {NET} and {LEI}, and we find that it significantly improves 
metrics of locality and memory overhead for each.},
+       abstract = {The performance of a dynamic optimization system depends 
heavily on the code it selects to optimize. Many current systems follow the 
design of {HP} Dynamo and select a single interprocedural path, or trace, as 
the unit of code optimization and code caching. Though this approach to region 
selection has worked well in practice, we show that it is possible to adapt 
this basic approach to produce regions with greater locality, less needless 
code duplication, and fewer profiling counters. In particular, we propose two 
new region-selection algorithms and evaluate them against Dynamo&#191;s 
selection mechanism, Next-Executing Tail {(NET).} Our first algorithm, 
Last-Executed Iteration {(LEI)}, identifies cyclic paths of execution better 
than {NET}, improving locality of execution while reducing the size of the code 
cache. Our second algorithm allows overlapping traces of similar execution 
frequency to be combined into a single large region. This second technique can 
be applied 
 to both {NET} and {LEI}, and we find that it significantly improves metrics of 
locality and memory overhead for each.},
        journal = {Proceedings of the 38th annual {IEEE/ACM} International 
Symposium on Microarchitecture},
        author = {Hiniker, David and Hazelwood, Kim and Smith, Michael D},
        year = {2005},
@@ -102,8 +119,7 @@
 
 @misc{pall_luajit_2009,
        title = {{LuaJIT} 2.0 intellectual property disclosure and research 
opportunities},
-       note = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html (accessed
-        June 2011)},
+       url = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html},
        author = {Pall, Mike},
        month = nov,
        year = {2009}
@@ -111,7 +127,7 @@
 
 @inproceedings{bolz_runtime_2011,
        address = {Lancaster, {UK}},
-       title = {Runtime Feedback in a {Meta-Tracing} {JIT} for Efficient 
Dynamic Languages},
+       title = {Runtime Feedback in a Meta-Tracing {JIT} for Efficient Dynamic 
Languages},
        abstract = {Meta-tracing {JIT} compilers can be applied to a variety of 
different languages without explicitly encoding language semantics into the 
compiler. So far, they lacked a way to give the language implementor control 
over runtime feedback. This restricted their performance. In this paper we 
describe the mechanisms in {PyPy&#8217;s} meta-tracing {JIT} that can be used 
to control runtime feedback in language-specific ways. These mechanisms are 
flexible enough to express classical {VM} techniques such as maps and runtime 
type feedback.},
        booktitle = {{ICOOOLPS}},
        publisher = {{ACM}},
@@ -135,7 +151,115 @@
        pages = {71--80}
 },
 
-@inproceedings{davide_ancona_rpython:_2007,
+@article{diwan_type-based_1998,
+       title = {Type-based alias analysis},
+       volume = {33},
+       issn = {0362-1340},
+       url = {http://doi.acm.org/10.1145/277652.277670},
+       doi = {10.1145/277652.277670},
+       abstract = {This paper evaluates three alias analyses based on 
programming language types. The first analysis uses type compatibility to 
determine aliases. The second extends the first by using additional high-level 
information such as field names. The third extends the second with a 
flow-insensitive analysis. Although other researchers suggests using types to 
disambiguate memory references, none evaluates its effectiveness. We perform 
both static and dynamic evaluations of type-based alias analyses for Modula-3, 
a statically-typed type-safe language. The static analysis reveals that type 
compatibility alone yields a very imprecise alias analysis, but the other two 
analyses significantly improve alias precision. We use redundant load 
elimination {(RLE)} to demonstrate the effectiveness of the three alias 
algorithms in terms of the opportunities for optimization, the impact on 
simulated execution times, and to compute an upper bound on what a perfect 
alias analysis would yield. We s
 how modest dynamic improvements for {(RLE)}, and more surprisingly, that on 
average our alias analysis is within 2.5\% of a perfect alias analysis with 
respect to {RLE} on 8 Modula-3 programs. These results illustrate that to 
explore thoroughly the effectiveness of alias analyses, researchers need 
static, dynamic, and upper-bound analysis. In addition, we show that for 
type-safe languages like Modula-3 and Java, a fast and simple alias analysis 
may be sufficient for many applications.},
+       number = {5},
+       journal = {{SIGPLAN} Not.},
+       author = {Diwan, Amer and {McKinley}, Kathryn S. and Moss, J. Eliot B.},
+       month = may,
+       year = {1998},
+       pages = {106&#8211;117}
+},
+
+@inproceedings{cytron_code_1986,
+       address = {New York, {NY}, {USA}},
+       series = {{POPL} '86},
+       title = {Code motion of control structures in high-level languages},
+       url = {http://doi.acm.org/10.1145/512644.512651},
+       doi = {10.1145/512644.512651},
+       abstract = {One trend among programmers is the increased use of 
abstractions. Through encapsulation techniques, abstractions extend the 
repertory of data structures and their concomitant operations that are 
processed directly by a compiler. For example, a compiler might not offer sets 
or set operations in its base language, but abstractions allow a programmer to 
define sets in terms of constructs already recognized by the compiler. In 
particular, abstractions can allow new constructs to be defined in terms of 
other abstractions. Although significant power is gained through the use of 
layered abstractions, object code quality suffers as increasingly less of a 
program's data structures and operations are exposed to the optimization phase 
of a compiler. Multiple references to abstractions are also inefficient, since 
the interaction between abstractions is often complex yet hidden from a 
compiler. Abstractions are most flexible when they are cast in general terms; a 
specific invocation
  is then tailored by the abstraction to obtain the appropriate code. A 
sequence of references to such abstractions can be inefficient due to 
functional redundancy that cannot be detected at compile-time. By integrating 
the references, the offending segments of code can be moved to a more 
advantageous position. Although procedure integration materializes abstracted 
constructs, the abstractions can still be ineligible for optimization using 
current techniques; in particular, abstractions often involve loops and 
conditional branches that can obscure code that would otherwise be eligible for 
code motion.},
+       booktitle = {Proceedings of the 13th {ACM} {SIGACT-SIGPLAN} symposium 
on Principles of programming languages},
+       publisher = {{ACM}},
+       author = {Cytron, Ron and Lowry, Andy and Zadeck, F. Kenneth},
+       year = {1986},
+       pages = {70&#8211;85}
+},
+
+@article{knoop_lazy_1992,
+       title = {Lazy code motion},
+       volume = {27},
+       issn = {0362-1340},
+       url = {http://doi.acm.org/10.1145/143103.143136},
+       doi = {10.1145/143103.143136},
+       abstract = {We present a bit-vector algorithm for the optimal and 
economical placement of computations within flow graphs, which is as efficient 
as standard uni-directional analyses. The point of our algorithm is the 
decomposition of the bi-directional structure of the known placement algorithms 
into a sequence of a backward and a forward analysis, which directly implies 
the efficiency result. Moreover, the new compositional structure opens the 
algorithm for modification: two further uni-directional analysis components 
exclude any unnecessary code motion. This laziness of our algorithm minimizes 
the register pressure, which has drastic effects on the run-time behaviour of 
the optimized programs in practice, where an economical use of registers is 
essential.},
+       number = {7},
+       journal = {{SIGPLAN} Not.},
+       author = {Knoop, Jens and R&#252;thing, Oliver and Steffen, Bernhard},
+       month = jul,
+       year = {1992},
+       pages = {224&#8211;234}
+},
+
+@incollection{allen_catalogue_1971,
+       title = {A Catalogue of Optimizing Transformations, ed. R. Rustin},
+       booktitle = {Design and Optimization of Compilers},
+       publisher = {Prentice-Hall},
+       author = {Allen, Frances and Cocke, John},
+       editor = {Rustin, Randall},
+       year = {1971},
+       pages = {1--30}
+},
+
+@inproceedings{chow_new_1997,
+       address = {New York, {NY}, {USA}},
+       series = {{PLDI} '97},
+       title = {A new algorithm for partial redundancy elimination based on 
{SSA} form},
+       isbn = {0-89791-907-6},
+       url = {http://doi.acm.org/10.1145/258915.258940},
+       doi = {10.1145/258915.258940},
+       abstract = {A new algorithm, {SSAPRE}, for performing partial 
redundancy elimination based entirely on {SSA} form is presented. It achieves 
optimal code motion similar to lazy code motion {[KRS94a}, {DS93]}, but is 
formulated independently and does not involve iterative data flow analysis and 
bit vectors in its solution. It not only exhibits the characteristics common to 
other sparse approaches, but also inherits the advantages shared by other 
{SSA-based} optimization techniques. {SSAPRE} also maintains its output in the 
same {SSA} form as its input. In describing the algorithm, we state theorems 
with proofs giving our claims about {SSAPRE.} We also give additional 
description about our practical implementation of {SSAPRE}, and analyze and 
compare its performance with a bit-vector-based implementation of {PRE.} We 
conclude with some discussion of the implications of this work.},
+       booktitle = {Proceedings of the {ACM} {SIGPLAN} 1997 conference on 
Programming language design and implementation},
+       publisher = {{ACM}},
+       author = {Chow, Fred and Chan, Sun and Kennedy, Robert and Liu, 
Shin-Ming and Lo, Raymond and Tu, Peng},
+       year = {1997},
+       pages = {273&#8211;286}
+},
+
+@article{morel_global_1979,
+       title = {Global optimization by suppression of partial redundancies},
+       volume = {22},
+       issn = {0001-0782},
+       url = {http://doi.acm.org/10.1145/359060.359069},
+       doi = {10.1145/359060.359069},
+       abstract = {The elimination of redundant computations and the moving of 
invariant computations out of loops are often done separately, with invariants 
moved outward loop by loop. We propose to do both at once and to move each 
expression directly to the entrance of the outermost loop in which it is 
invariant. This is done by solving a more general problem, i.e. the elimination 
of computations performed twice on a given execution path. Such computations 
are termed partially redundant. Moreover, the algorithm does not require any 
graphical information or restrictions on the shape of the program graph. 
Testing this algorithm has shown that its execution cost is nearly linear with 
the size of the program, and that it leads to a smaller optimizer that requires 
less execution time.},
+       number = {2},
+       journal = {Commun. {ACM}},
+       author = {Morel, E. and Renvoise, C.},
+       month = feb,
+       year = {1979},
+       keywords = {Boolean systems, compilation, compiler, data flow analysis, 
invariant computation elimination, optimization, optimizer, partial redundancy, 
redundancy elimination},
+       pages = {96&#8211;103}
+},
+
+@article{dhamdhere_practical_1991,
+       title = {Practical adaption of the global optimization algorithm of 
Morel and Renvoise},
+       volume = {13},
+       issn = {0164-0925},
+       url = {http://doi.acm.org/10.1145/103135.214520},
+       doi = {10.1145/103135.214520},
+       number = {2},
+       journal = {{ACM} Trans. Program. Lang. Syst.},
+       author = {Dhamdhere, D. M.},
+       month = apr,
+       year = {1991},
+       pages = {291&#8211;294}
+},
+
+@phdthesis{chow_portable_1984,
+       address = {Stanford, {CA}, {USA}},
+       title = {A portable machine-independent global optimizer&#8211;design 
and measurements},
+       school = {Stanford University},
+       author = {Chow, Frederick Chi-Tak},
+       year = {1984},
+       note = {{AAI8408268}}
+},
+
+@inproceedings{ancona_rpython:_2007,
        address = {Montreal, Quebec, Canada},
        title = {{RPython:} a step towards reconciling dynamically and 
statically typed {OO} languages},
        isbn = {978-1-59593-868-8},
@@ -145,17 +269,17 @@
        abstract = {Although the C-based interpreter of Python is reasonably 
fast, implementations on the {CLI} or the {JVM} platforms offers some 
advantages in terms of robustness and interoperability. Unfortunately, because 
the {CLI} and {JVM} are primarily designed to execute statically typed, 
object-oriented languages, most dynamic language implementations cannot use the 
native bytecodes for common operations like method calls and exception 
handling; as a result, they are not able to take full advantage of the power 
offered by the {CLI} and {JVM.}},
        booktitle = {{DLS}},
        publisher = {{ACM}},
-       author = {Davide Ancona and Massimo Ancona and Antonio Cuni and 
Nicholas D. Matsakis},
+       author = {Ancona, Davide and Ancona, Massimo and Cuni, Antonio and 
Matsakis, Nicholas D.},
        year = {2007},
        keywords = {{JVM}, .net, Python}
 },
 
 @article{futamura_partial_1999,
-       title = {Partial Evaluation of Computation Process - An Approach to a 
{Compiler-Compiler}},
+       title = {Partial Evaluation of Computation Process - An Approach to a 
Compiler-Compiler},
        volume = {12},
        url = {http://citeseer.ist.psu.edu/futamura99partial.html},
        number = {4},
-       journal = {{Higher-Order} and Symbolic Computation},
+       journal = {Higher-Order and Symbolic Computation},
        author = {Futamura, Yoshihiko},
        year = {1999},
        keywords = {Futamura},
@@ -167,31 +291,31 @@
        isbn = {0-13-020249-5},
        url = {http://portal.acm.org/citation.cfm?id=153676},
        abstract = {This book is out of print. For copies, Please refer to the 
following online page},
-       publisher = {{Prentice-Hall}},
+       publisher = {Prentice-Hall},
        author = {Jones, Neil D. and Gomard, Carsten K. and Sestoft, Peter},
        year = {1993}
 },
 
-@inproceedings{armin_rigo_pypys_2006,
+@inproceedings{rigo_pypys_2006,
        address = {Portland, Oregon, {USA}},
        title = {{PyPy's} approach to virtual machine construction},
-       isbn = {{1-59593-491-X}},
+       isbn = {1-59593-491-X},
        url = {http://portal.acm.org/citation.cfm?id=1176753},
        doi = {10.1145/1176617.1176753},
        abstract = {The {PyPy} project seeks to prove both on a research and a 
practical level the feasibility of constructing a virtual machine {(VM)} for a 
dynamic language in a dynamic language - in this case, Python. The aim is to 
translate (i.e. compile) the {VM} to arbitrary target environments, ranging in 
level from {C/Posix} to {Smalltalk/Squeak} via Java and {CLI/.NET}, while still 
being of reasonable efficiency within these {environments.A} key tool to 
achieve this goal is the systematic reuse of the Python language as a system 
programming language at various levels of our architecture and translation 
process. For each level, we design a corresponding type system and apply a 
generic type inference engine - for example, the garbage collector is written 
in a style that manipulates simulated pointer and address objects, and when 
translated to C these operations become C-level pointer and address 
instructions.},
        booktitle = {{DLS}},
        publisher = {{ACM}},
-       author = {Armin Rigo and Samuele Pedroni},
+       author = {Rigo, Armin and Pedroni, Samuele},
        year = {2006},
        keywords = {metacircularity, Python, retargettable code generation, 
type inference, {VM}}
 },
 
 @article{georges_statistically_2007,
-       title = {Statistically rigorous java performance evaluation},
+       title = {Statistically rigorous Java performance evaluation},
        volume = {42},
        url = {http://portal.acm.org/citation.cfm?id=1297105.1297033},
        doi = {10.1145/1297105.1297033},
-       abstract = {Java performance is far from being trivial to benchmark 
because it is affected by various factors such as the Java application, its 
input, the virtual machine, the garbage collector, the heap size, etc. In 
addition, non-determinism at run-time causes the execution time of a Java 
program to differ from run to run. There are a number of sources of 
non-determinism such as {Just-In-Time} {(JIT)} compilation and optimization in 
the virtual machine {(VM)} driven by timer-based method sampling, thread 
scheduling, garbage collection, and various.},
+       abstract = {Java performance is far from being trivial to benchmark 
because it is affected by various factors such as the Java application, its 
input, the virtual machine, the garbage collector, the heap size, etc. In 
addition, non-determinism at run-time causes the execution time of a Java 
program to differ from run to run. There are a number of sources of 
non-determinism such as Just-In-Time {(JIT)} compilation and optimization in 
the virtual machine {(VM)} driven by timer-based method sampling, thread 
scheduling, garbage collection, and various.},
        number = {10},
        journal = {{SIGPLAN} Notices},
        author = {Georges, Andy and Buytaert, Dries and Eeckhout, Lieven},
@@ -228,12 +352,12 @@
        pages = {1--12}
 },
 
-@techreport{andreas_gal_incremental_2006,
+@techreport{gal_incremental_2006,
        title = {Incremental Dynamic Code Generation with Trace Trees},
        abstract = {The unit of compilation for traditional just-in-time 
compilers is the method. We have explored trace-based compilation, in which the 
unit of compilation is a loop, potentially spanning multiple methods and even 
library code. Using a new intermediate representation that is discovered and 
updated lazily on-demand while the program is being executed, our compiler 
generates code that is competitive with traditional dynamic compilers, but that 
uses only a fraction of the compile time and memory footprint.},
        number = {{ICS-TR-06-16}},
        institution = {Donald Bren School of Information and Computer Science, 
University of California, Irvine},
-       author = {Andreas Gal and Michael Franz},
+       author = {Gal, Andreas and Franz, Michael},
        month = nov,
        year = {2006},
        pages = {11}
@@ -254,19 +378,19 @@
        keywords = {dynamic compilation, embedded, software trace scheduling, 
{SSA}, {VM}}
 },
 
-@inproceedings{mario_wolczko_towards_1999,
-       title = {Towards a Universal Implementation Substrate for 
{Object-Oriented} Languages},
+@inproceedings{wolczko_towards_1999,
+       title = {Towards a Universal Implementation Substrate for 
Object-Oriented Languages},
        abstract = {Self is a minimalist object-oriented language with a 
sophisticated implementation that utilizes adaptive optimization. We have built 
implementations of Smalltalk and Java by translation to Self. These 
implementations were much easier to construct in Self than by conventional 
means, and perform surprisingly well (competitively with conventional, 
commercial implementations). This leads us to believe that a Self-like system 
may form the basis of a universal substrate for implementation of 
object-oriented languages.},
        booktitle = {{OOPSLA} workshop on Simplicity, Performance, and 
Portability in Virtual Machine Design},
-       author = {Mario Wolczko and Ole Agesen and David Ungar},
+       author = {Wolczko, Mario and Agesen, Ole and {{David} Ungar}},
        year = {1999},
        keywords = {fixme}
 },
 
-@inproceedings{hoelzle_optimizing_1994,
+@inproceedings{holzle_optimizing_1994,
        address = {Orlando, Florida, United States},
        title = {Optimizing dynamically-dispatched calls with run-time type 
feedback},
-       isbn = {{0-89791-662-X}},
+       isbn = {0-89791-662-X},
        url = {http://portal.acm.org/citation.cfm?id=178243.178478},
        doi = {10.1145/178243.178478},
        abstract = {Note: {OCR} errors may be found in this Reference List 
extracted from the full text article. {ACM} has opted to expose the complete 
List rather than only correct and linked references.},
@@ -306,17 +430,17 @@
        doi = {10.1145/74878.74884},
        abstract = {We have developed and implemented techniques that double 
the performance of dynamically-typed object-oriented languages. Our {SELF} 
implementation runs twice as fast as the fastest Smalltalk implementation, 
despite {SELF's} lack of classes and explicit variables. To compensate for the 
absence of classes, our system uses implementation-level maps to transparently 
group objects cloned from the same prototype, providing data type information 
and eliminating the apparent space overhead for prototype-based systems. To 
compensate for dynamic typing, user-defined control structures, and the lack of 
explicit variables, our system dynamically compiles multiple versions of a 
source method, each customized according to its receiver's map. Within each 
version the type of the receiver is fixed, and thus the compiler can statically 
bind and inline all messages sent to self. Message splitting and type 
prediction extract and preserve even more static type information, allowing the 
comp
 iler to inline many other messages. Inlining dramatically improves performance 
and eliminates the need to hard-wire low-level methods such as +,==, and 
{ifTrue:.} Despite inlining and other optimizations, our system still supports 
interactive programming environments. The system traverses internal dependency 
lists to invalidate all compiled methods affected by a programming change. The 
debugger reconstructs inlined stack frames from compiler-generated debugging 
information, making inlining invisible to the {SELF} programmer.},
        booktitle = {{OOPSLA}},
-       author = {Chambers, C. and Ungar, D. and E. Lee},
+       author = {Chambers, C. and Ungar, D. and {{E.} Lee}},
        year = {1989},
        keywords = {self, specialization}
 },
 
-@inproceedings{hoelzle_optimizing_1991,
-       title = {Optimizing {Dynamically-Typed} {Object-Oriented} Languages 
With Polymorphic Inline Caches},
+@inproceedings{holzle_optimizing_1991,
+       title = {Optimizing Dynamically-Typed Object-Oriented Languages With 
Polymorphic Inline Caches},
        isbn = {3-540-54262-0},
        url = {http://portal.acm.org/citation.cfm?id=679193&dl=ACM&coll=portal},
        booktitle = {{ECOOP}},
-       publisher = {{Springer-Verlag}},
+       publisher = {Springer-Verlag},
        author = {H&#246;lzle, Urs and Chambers, Craig and Ungar, David},
        year = {1991}
 },
@@ -346,23 +470,4 @@
        publisher = {{ACM}},
        author = {Sullivan, Gregory T. and Bruening, Derek L. and Baron, Iris 
and Garnett, Timothy and Amarasinghe, Saman},
        year = {2003}
-}
-
-@inproceedings{kotzmann_escape_2005,
-       address = {New York, {NY}, {USA}},
-       series = {{VEE} '05},
-       title = {Escape analysis in the context of dynamic compilation and 
deoptimization},
-       isbn = {1-59593-047-7},
-       location = {Chicago, {IL}, {USA}},
-       doi = {10.1145/1064979.1064996},
-       abstract = {In object-oriented programming languages, an object is said 
to escape the method or thread in which it was created if it can also be 
accessed by other methods or threads. Knowing which objects do not escape 
allows a compiler to perform aggressive {optimizations.This} paper presents a 
new intraprocedural and interprocedural algorithm for escape analysis in the 
context of dynamic compilation where the compiler has to cope with dynamic 
class loading and deoptimization. It was implemented for Sun Microsystems' Java 
{HotSpot&#8482;} client compiler and operates on an intermediate representation 
in {SSA} form. We introduce equi-escape sets for the efficient propagation of 
escape information between related objects. The analysis is used for scalar 
replacement of fields and synchronization removal, as well as for stack 
allocation of objects and fixed-sized arrays. The results of the 
interprocedural analysis support the compiler in inlining decisions and allow 
actual parameters 
 to be allocated on the caller {stack.Under} certain circumstances, the Java 
{HotSpot&#8482;} {VM} is forced to stop executing a method's machine code and 
transfer control to the interpreter. This is called deoptimization. Since the 
interpreter does not know about the scalar replacement and synchronization 
removal performed by the compiler, the deoptimization framework was extended to 
reallocate and relock objects on demand.},
-       booktitle = {Proceedings of the 1st {ACM/USENIX} international 
conference on Virtual execution environments},
-       publisher = {{ACM}},
-       author = {Kotzmann, Thomas and M&#246;ssenb&#246;ck, Hanspeter},
-       year = {2005},
-       note = {{ACM} {ID:} 1064996},
-       keywords = {algorithms, allocation/deallocation strategies, 
deoptimization},
-       pages = {111&#8211;120}
-},
-
-
+}
\ No newline at end of file
diff --git a/talk/dls2012/paper.tex b/talk/dls2012/paper.tex
--- a/talk/dls2012/paper.tex
+++ b/talk/dls2012/paper.tex
@@ -208,8 +208,8 @@
 
 The work described in this paper was done in the context of the PyPy
 project.\footnote{\texttt{http://pypy.org}} PyPy is a framework for 
implementing
-dynamic languages efficiently~\cite{armin_rigo_pypys_2006}. When implementing a
-language with PyPy, one writes an interpreter for the language in 
RPython~\cite{davide_ancona_rpython:_2007}.
+dynamic languages efficiently~\cite{rigo_pypys_2006}. When implementing a
+language with PyPy, one writes an interpreter for the language in 
RPython~\cite{ancona_rpython:_2007}.
 RPython (``Restricted Python``) is a subset
 of Python chosen in such a way that it can be efficiently translated to a
 C-based VM by performing type inference.
@@ -672,8 +672,8 @@
 can be reused for all other appearances. RPython's optimizers can also remove
 repeated heap reads if the intermediate operations cannot have changed their
 value.\footnote{We perform a type-based alias analysis to know which
-writes can affect which reads~\cite{XXX}. In addition writes on newly 
allocated objects
-can never change the value of old existing ones.}
+writes can affect which reads~\cite{diwan_type-based_1998}. In addition writes
+on newly allocated objects can never change the value of old existing ones.}
 
 When that is combined with loop peeling, the single execution of the operation
 is placed in the preamble. That is, loop invariant pure operations and heap
@@ -1091,7 +1091,7 @@
 
 Other interesting interpreters that are helped greatly by this optimization are
 for example our Prolog interpreter written in
-RPython~\cite{carl_friedrich_bolz_towards_2010}. Prolog programs often contain
+RPython~\cite{bolz_towards_2010}. Prolog programs often contain
 tight
 loops that perform list processing. Furthermore we experimented with a Python 
library
 for writing numerical kernels doing array manipulation. The exact extent is
@@ -1105,7 +1105,18 @@
 optimizations described here achieve are not in any way new. However, we think
 that achieving them in the way described in this paper is simpler than writing
 explicit algorithms.
-\cfbolz{more explicit listing of prior work goes here}
+
+Loop invariant code motion has been part of early compilers in the 1960s and
+1970s~\cite{allen_catalogue_1971}. The approach for achieving loop invariant
+code motion is typically to perform partial redundancy elimination. The
+approach was first proposed by Morel and Renvoise~\cite{morel_global_1979}. It
+involves solving data flow problems usually involding bidirection data flow
+equations. After improvements~\cite{chow_portable_1984,
+dhamdhere_practical_1991} this approach was followed by the work of Knoop
+et.al.~\cite{knoop_lazy_1992} who cleany separated the problem into a backward
+and forward data flow analysis. Implementing partial redundancy elimination in
+compilers that use SSA form \cite{chow_new_1997} simplified the algorithms
+because no iterative data flow analysis is needed any more.
 
 As described in the introduction,
 Mike Pall pioneered the approach described in this paper.
@@ -1155,7 +1166,7 @@
 
 The current approach still has some limitations which we plan to address in the
 future. In particular loop peeling works poorly in combination with trace
-trees~\cite{andreas_gal_incremental_2006} or trace 
stitching~\cite{gal_trace-based_2009}.
+trees~\cite{gal_incremental_2006} or trace 
stitching~\cite{gal_trace-based_2009}.
 The side exits attached guards that fail often
 currently have to jump to the preamble which makes loops with several equally
 common paths less efficient than they could be.
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] extradoc extradoc: more in-depth discussion of related work

Reply via email to