Author: Carl Friedrich Bolz <[email protected]>
Branch: extradoc
Changeset: r4570:d3fc033b4584
Date: 2012-08-14 17:37 +0200
http://bitbucket.org/pypy/extradoc/changeset/d3fc033b4584/
Log: more in-depth discussion of related work
diff --git a/talk/dls2012/paper.bib b/talk/dls2012/paper.bib
--- a/talk/dls2012/paper.bib
+++ b/talk/dls2012/paper.bib
@@ -12,7 +12,7 @@
year = {1984}
},
-@inproceedings{carl_friedrich_bolz_towards_2010,
+@inproceedings{bolz_towards_2010,
address = {Hagenberg, Austria},
title = {Towards a Jitting {VM} for Prolog execution},
isbn = {978-1-4503-0132-9},
@@ -21,7 +21,7 @@
abstract = {Most Prolog implementations are implemented in low-level
languages such as C and are based on a variation of the {WAM} instruction set,
which enhances their performance but makes them hard to write. In addition,
many of the more dynamic features of Prolog (like assert), despite their
popularity, are not well supported. We present a high-level continuation-based
Prolog interpreter based on the {PyPy} project. The {PyPy} project makes it
possible to easily and efficiently implement dynamic languages. It provides
tools that automatically generate a just-in-time compiler for a given
interpreter of the target language, by using partial evaluation techniques. The
resulting Prolog implementation is surprisingly efficient: it clearly
outperforms existing interpreters of Prolog in high-level languages such as
Java. Moreover, on some benchmarks, our system outperforms state-of-the-art
{WAM-based} Prolog implementations. Our paper aims to show that declarative
languages such as Pr
olog can indeed benefit from having a just-in-time compiler and that {PyPy}
can form the basis for implementing programming languages other than Python.},
booktitle = {{PPDP}},
publisher = {{ACM}},
- author = {Carl Friedrich Bolz and Michael Leuschel and David Schneider},
+ author = {Bolz, Carl Friedrich and Leuschel, Michael and Schneider,
David},
year = {2010},
keywords = {interpreters, jit, logic programming, partial evaluation}
},
@@ -57,8 +57,25 @@
keywords = {code generation, design, dynamically typed languages,
experimentation, incremental compilers, languages, measurement, performance,
run-time environments, trace-based compilation}
},
+@inproceedings{kotzmann_escape_2005,
+ address = {New York, {NY}, {USA}},
+ series = {{VEE} '05},
+ title = {Escape analysis in the context of dynamic compilation and
deoptimization},
+ isbn = {1-59593-047-7},
+ location = {Chicago, {IL}, {USA}},
+ doi = {10.1145/1064979.1064996},
+ abstract = {In object-oriented programming languages, an object is said
to escape the method or thread in which it was created if it can also be
accessed by other methods or threads. Knowing which objects do not escape
allows a compiler to perform aggressive {optimizations.This} paper presents a
new intraprocedural and interprocedural algorithm for escape analysis in the
context of dynamic compilation where the compiler has to cope with dynamic
class loading and deoptimization. It was implemented for Sun Microsystems' Java
{HotSpot™} client compiler and operates on an intermediate representation
in {SSA} form. We introduce equi-escape sets for the efficient propagation of
escape information between related objects. The analysis is used for scalar
replacement of fields and synchronization removal, as well as for stack
allocation of objects and fixed-sized arrays. The results of the
interprocedural analysis support the compiler in inlining decisions and allow
actual parameters
to be allocated on the caller {stack.Under} certain circumstances, the Java
{HotSpot™} {VM} is forced to stop executing a method's machine code and
transfer control to the interpreter. This is called deoptimization. Since the
interpreter does not know about the scalar replacement and synchronization
removal performed by the compiler, the deoptimization framework was extended to
reallocate and relock objects on demand.},
+ booktitle = {Proceedings of the 1st {ACM/USENIX} international
conference on Virtual execution environments},
+ publisher = {{ACM}},
+ author = {Kotzmann, Thomas and Mössenböck, Hanspeter},
+ year = {2005},
+ note = {{ACM} {ID:} 1064996},
+ keywords = {algorithms, allocation/deallocation strategies,
deoptimization},
+ pages = {111–120}
+},
+
@inproceedings{bolz_towards_2009,
- title = {Towards {Just-In-Time} Partial Evaluation of Prolog},
+ title = {Towards Just-In-Time Partial Evaluation of Prolog},
doi = {10.1007/978-3-642-12592-8_12},
booktitle = {Logic Program Synthesis and Transformation},
author = {Bolz, Carl Friedrich and Leuschel, Michael and Rigo, Armin},
@@ -82,7 +99,7 @@
location = {Barcelona, Spain},
url = {http://dx.doi.org/10.1109/MICRO.2005.22},
doi = {http://dx.doi.org/10.1109/MICRO.2005.22},
- abstract = {The performance of a dynamic optimization system depends
heavily on the code it selects to optimize. Many current systems follow the
design of {HP} Dynamo and select a single interprocedural path, or trace, as
the unit of code optimization and code caching. Though this approach to region
selection has worked well in practice, we show that it is possible to adapt
this basic approach to produce regions with greater locality, less needless
code duplication, and fewer profiling counters. In particular, we propose two
new region-selection algorithms and evaluate them against Dynamo¿s
selection mechanism, {Next-Executing} Tail {(NET).} Our first algorithm,
{Last-Executed} Iteration {(LEI)}, identifies cyclic paths of execution better
than {NET}, improving locality of execution while reducing the size of the code
cache. Our second algorithm allows overlapping traces of similar execution
frequency to be combined into a single large region. This second technique can
be appl
ied to both {NET} and {LEI}, and we find that it significantly improves
metrics of locality and memory overhead for each.},
+ abstract = {The performance of a dynamic optimization system depends
heavily on the code it selects to optimize. Many current systems follow the
design of {HP} Dynamo and select a single interprocedural path, or trace, as
the unit of code optimization and code caching. Though this approach to region
selection has worked well in practice, we show that it is possible to adapt
this basic approach to produce regions with greater locality, less needless
code duplication, and fewer profiling counters. In particular, we propose two
new region-selection algorithms and evaluate them against Dynamo¿s
selection mechanism, Next-Executing Tail {(NET).} Our first algorithm,
Last-Executed Iteration {(LEI)}, identifies cyclic paths of execution better
than {NET}, improving locality of execution while reducing the size of the code
cache. Our second algorithm allows overlapping traces of similar execution
frequency to be combined into a single large region. This second technique can
be applied
to both {NET} and {LEI}, and we find that it significantly improves metrics of
locality and memory overhead for each.},
journal = {Proceedings of the 38th annual {IEEE/ACM} International
Symposium on Microarchitecture},
author = {Hiniker, David and Hazelwood, Kim and Smith, Michael D},
year = {2005},
@@ -102,8 +119,7 @@
@misc{pall_luajit_2009,
title = {{LuaJIT} 2.0 intellectual property disclosure and research
opportunities},
- note = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html (accessed
- June 2011)},
+ url = {http://lua-users.org/lists/lua-l/2009-11/msg00089.html},
author = {Pall, Mike},
month = nov,
year = {2009}
@@ -111,7 +127,7 @@
@inproceedings{bolz_runtime_2011,
address = {Lancaster, {UK}},
- title = {Runtime Feedback in a {Meta-Tracing} {JIT} for Efficient
Dynamic Languages},
+ title = {Runtime Feedback in a Meta-Tracing {JIT} for Efficient Dynamic
Languages},
abstract = {Meta-tracing {JIT} compilers can be applied to a variety of
different languages without explicitly encoding language semantics into the
compiler. So far, they lacked a way to give the language implementor control
over runtime feedback. This restricted their performance. In this paper we
describe the mechanisms in {PyPy’s} meta-tracing {JIT} that can be used
to control runtime feedback in language-specific ways. These mechanisms are
flexible enough to express classical {VM} techniques such as maps and runtime
type feedback.},
booktitle = {{ICOOOLPS}},
publisher = {{ACM}},
@@ -135,7 +151,115 @@
pages = {71--80}
},
-@inproceedings{davide_ancona_rpython:_2007,
+@article{diwan_type-based_1998,
+ title = {Type-based alias analysis},
+ volume = {33},
+ issn = {0362-1340},
+ url = {http://doi.acm.org/10.1145/277652.277670},
+ doi = {10.1145/277652.277670},
+ abstract = {This paper evaluates three alias analyses based on
programming language types. The first analysis uses type compatibility to
determine aliases. The second extends the first by using additional high-level
information such as field names. The third extends the second with a
flow-insensitive analysis. Although other researchers suggests using types to
disambiguate memory references, none evaluates its effectiveness. We perform
both static and dynamic evaluations of type-based alias analyses for Modula-3,
a statically-typed type-safe language. The static analysis reveals that type
compatibility alone yields a very imprecise alias analysis, but the other two
analyses significantly improve alias precision. We use redundant load
elimination {(RLE)} to demonstrate the effectiveness of the three alias
algorithms in terms of the opportunities for optimization, the impact on
simulated execution times, and to compute an upper bound on what a perfect
alias analysis would yield. We s
how modest dynamic improvements for {(RLE)}, and more surprisingly, that on
average our alias analysis is within 2.5\% of a perfect alias analysis with
respect to {RLE} on 8 Modula-3 programs. These results illustrate that to
explore thoroughly the effectiveness of alias analyses, researchers need
static, dynamic, and upper-bound analysis. In addition, we show that for
type-safe languages like Modula-3 and Java, a fast and simple alias analysis
may be sufficient for many applications.},
+ number = {5},
+ journal = {{SIGPLAN} Not.},
+ author = {Diwan, Amer and {McKinley}, Kathryn S. and Moss, J. Eliot B.},
+ month = may,
+ year = {1998},
+ pages = {106–117}
+},
+
+@inproceedings{cytron_code_1986,
+ address = {New York, {NY}, {USA}},
+ series = {{POPL} '86},
+ title = {Code motion of control structures in high-level languages},
+ url = {http://doi.acm.org/10.1145/512644.512651},
+ doi = {10.1145/512644.512651},
+ abstract = {One trend among programmers is the increased use of
abstractions. Through encapsulation techniques, abstractions extend the
repertory of data structures and their concomitant operations that are
processed directly by a compiler. For example, a compiler might not offer sets
or set operations in its base language, but abstractions allow a programmer to
define sets in terms of constructs already recognized by the compiler. In
particular, abstractions can allow new constructs to be defined in terms of
other abstractions. Although significant power is gained through the use of
layered abstractions, object code quality suffers as increasingly less of a
program's data structures and operations are exposed to the optimization phase
of a compiler. Multiple references to abstractions are also inefficient, since
the interaction between abstractions is often complex yet hidden from a
compiler. Abstractions are most flexible when they are cast in general terms; a
specific invocation
is then tailored by the abstraction to obtain the appropriate code. A
sequence of references to such abstractions can be inefficient due to
functional redundancy that cannot be detected at compile-time. By integrating
the references, the offending segments of code can be moved to a more
advantageous position. Although procedure integration materializes abstracted
constructs, the abstractions can still be ineligible for optimization using
current techniques; in particular, abstractions often involve loops and
conditional branches that can obscure code that would otherwise be eligible for
code motion.},
+ booktitle = {Proceedings of the 13th {ACM} {SIGACT-SIGPLAN} symposium
on Principles of programming languages},
+ publisher = {{ACM}},
+ author = {Cytron, Ron and Lowry, Andy and Zadeck, F. Kenneth},
+ year = {1986},
+ pages = {70–85}
+},
+
+@article{knoop_lazy_1992,
+ title = {Lazy code motion},
+ volume = {27},
+ issn = {0362-1340},
+ url = {http://doi.acm.org/10.1145/143103.143136},
+ doi = {10.1145/143103.143136},
+ abstract = {We present a bit-vector algorithm for the optimal and
economical placement of computations within flow graphs, which is as efficient
as standard uni-directional analyses. The point of our algorithm is the
decomposition of the bi-directional structure of the known placement algorithms
into a sequence of a backward and a forward analysis, which directly implies
the efficiency result. Moreover, the new compositional structure opens the
algorithm for modification: two further uni-directional analysis components
exclude any unnecessary code motion. This laziness of our algorithm minimizes
the register pressure, which has drastic effects on the run-time behaviour of
the optimized programs in practice, where an economical use of registers is
essential.},
+ number = {7},
+ journal = {{SIGPLAN} Not.},
+ author = {Knoop, Jens and Rüthing, Oliver and Steffen, Bernhard},
+ month = jul,
+ year = {1992},
+ pages = {224–234}
+},
+
+@incollection{allen_catalogue_1971,
+ title = {A Catalogue of Optimizing Transformations, ed. R. Rustin},
+ booktitle = {Design and Optimization of Compilers},
+ publisher = {Prentice-Hall},
+ author = {Allen, Frances and Cocke, John},
+ editor = {Rustin, Randall},
+ year = {1971},
+ pages = {1--30}
+},
+
+@inproceedings{chow_new_1997,
+ address = {New York, {NY}, {USA}},
+ series = {{PLDI} '97},
+ title = {A new algorithm for partial redundancy elimination based on
{SSA} form},
+ isbn = {0-89791-907-6},
+ url = {http://doi.acm.org/10.1145/258915.258940},
+ doi = {10.1145/258915.258940},
+ abstract = {A new algorithm, {SSAPRE}, for performing partial
redundancy elimination based entirely on {SSA} form is presented. It achieves
optimal code motion similar to lazy code motion {[KRS94a}, {DS93]}, but is
formulated independently and does not involve iterative data flow analysis and
bit vectors in its solution. It not only exhibits the characteristics common to
other sparse approaches, but also inherits the advantages shared by other
{SSA-based} optimization techniques. {SSAPRE} also maintains its output in the
same {SSA} form as its input. In describing the algorithm, we state theorems
with proofs giving our claims about {SSAPRE.} We also give additional
description about our practical implementation of {SSAPRE}, and analyze and
compare its performance with a bit-vector-based implementation of {PRE.} We
conclude with some discussion of the implications of this work.},
+ booktitle = {Proceedings of the {ACM} {SIGPLAN} 1997 conference on
Programming language design and implementation},
+ publisher = {{ACM}},
+ author = {Chow, Fred and Chan, Sun and Kennedy, Robert and Liu,
Shin-Ming and Lo, Raymond and Tu, Peng},
+ year = {1997},
+ pages = {273–286}
+},
+
+@article{morel_global_1979,
+ title = {Global optimization by suppression of partial redundancies},
+ volume = {22},
+ issn = {0001-0782},
+ url = {http://doi.acm.org/10.1145/359060.359069},
+ doi = {10.1145/359060.359069},
+ abstract = {The elimination of redundant computations and the moving of
invariant computations out of loops are often done separately, with invariants
moved outward loop by loop. We propose to do both at once and to move each
expression directly to the entrance of the outermost loop in which it is
invariant. This is done by solving a more general problem, i.e. the elimination
of computations performed twice on a given execution path. Such computations
are termed partially redundant. Moreover, the algorithm does not require any
graphical information or restrictions on the shape of the program graph.
Testing this algorithm has shown that its execution cost is nearly linear with
the size of the program, and that it leads to a smaller optimizer that requires
less execution time.},
+ number = {2},
+ journal = {Commun. {ACM}},
+ author = {Morel, E. and Renvoise, C.},
+ month = feb,
+ year = {1979},
+ keywords = {Boolean systems, compilation, compiler, data flow analysis,
invariant computation elimination, optimization, optimizer, partial redundancy,
redundancy elimination},
+ pages = {96–103}
+},
+
+@article{dhamdhere_practical_1991,
+ title = {Practical adaption of the global optimization algorithm of
Morel and Renvoise},
+ volume = {13},
+ issn = {0164-0925},
+ url = {http://doi.acm.org/10.1145/103135.214520},
+ doi = {10.1145/103135.214520},
+ number = {2},
+ journal = {{ACM} Trans. Program. Lang. Syst.},
+ author = {Dhamdhere, D. M.},
+ month = apr,
+ year = {1991},
+ pages = {291–294}
+},
+
+@phdthesis{chow_portable_1984,
+ address = {Stanford, {CA}, {USA}},
+ title = {A portable machine-independent global optimizer–design
and measurements},
+ school = {Stanford University},
+ author = {Chow, Frederick Chi-Tak},
+ year = {1984},
+ note = {{AAI8408268}}
+},
+
+@inproceedings{ancona_rpython:_2007,
address = {Montreal, Quebec, Canada},
title = {{RPython:} a step towards reconciling dynamically and
statically typed {OO} languages},
isbn = {978-1-59593-868-8},
@@ -145,17 +269,17 @@
abstract = {Although the C-based interpreter of Python is reasonably
fast, implementations on the {CLI} or the {JVM} platforms offers some
advantages in terms of robustness and interoperability. Unfortunately, because
the {CLI} and {JVM} are primarily designed to execute statically typed,
object-oriented languages, most dynamic language implementations cannot use the
native bytecodes for common operations like method calls and exception
handling; as a result, they are not able to take full advantage of the power
offered by the {CLI} and {JVM.}},
booktitle = {{DLS}},
publisher = {{ACM}},
- author = {Davide Ancona and Massimo Ancona and Antonio Cuni and
Nicholas D. Matsakis},
+ author = {Ancona, Davide and Ancona, Massimo and Cuni, Antonio and
Matsakis, Nicholas D.},
year = {2007},
keywords = {{JVM}, .net, Python}
},
@article{futamura_partial_1999,
- title = {Partial Evaluation of Computation Process - An Approach to a
{Compiler-Compiler}},
+ title = {Partial Evaluation of Computation Process - An Approach to a
Compiler-Compiler},
volume = {12},
url = {http://citeseer.ist.psu.edu/futamura99partial.html},
number = {4},
- journal = {{Higher-Order} and Symbolic Computation},
+ journal = {Higher-Order and Symbolic Computation},
author = {Futamura, Yoshihiko},
year = {1999},
keywords = {Futamura},
@@ -167,31 +291,31 @@
isbn = {0-13-020249-5},
url = {http://portal.acm.org/citation.cfm?id=153676},
abstract = {This book is out of print. For copies, Please refer to the
following online page},
- publisher = {{Prentice-Hall}},
+ publisher = {Prentice-Hall},
author = {Jones, Neil D. and Gomard, Carsten K. and Sestoft, Peter},
year = {1993}
},
-@inproceedings{armin_rigo_pypys_2006,
+@inproceedings{rigo_pypys_2006,
address = {Portland, Oregon, {USA}},
title = {{PyPy's} approach to virtual machine construction},
- isbn = {{1-59593-491-X}},
+ isbn = {1-59593-491-X},
url = {http://portal.acm.org/citation.cfm?id=1176753},
doi = {10.1145/1176617.1176753},
abstract = {The {PyPy} project seeks to prove both on a research and a
practical level the feasibility of constructing a virtual machine {(VM)} for a
dynamic language in a dynamic language - in this case, Python. The aim is to
translate (i.e. compile) the {VM} to arbitrary target environments, ranging in
level from {C/Posix} to {Smalltalk/Squeak} via Java and {CLI/.NET}, while still
being of reasonable efficiency within these {environments.A} key tool to
achieve this goal is the systematic reuse of the Python language as a system
programming language at various levels of our architecture and translation
process. For each level, we design a corresponding type system and apply a
generic type inference engine - for example, the garbage collector is written
in a style that manipulates simulated pointer and address objects, and when
translated to C these operations become C-level pointer and address
instructions.},
booktitle = {{DLS}},
publisher = {{ACM}},
- author = {Armin Rigo and Samuele Pedroni},
+ author = {Rigo, Armin and Pedroni, Samuele},
year = {2006},
keywords = {metacircularity, Python, retargettable code generation,
type inference, {VM}}
},
@article{georges_statistically_2007,
- title = {Statistically rigorous java performance evaluation},
+ title = {Statistically rigorous Java performance evaluation},
volume = {42},
url = {http://portal.acm.org/citation.cfm?id=1297105.1297033},
doi = {10.1145/1297105.1297033},
- abstract = {Java performance is far from being trivial to benchmark
because it is affected by various factors such as the Java application, its
input, the virtual machine, the garbage collector, the heap size, etc. In
addition, non-determinism at run-time causes the execution time of a Java
program to differ from run to run. There are a number of sources of
non-determinism such as {Just-In-Time} {(JIT)} compilation and optimization in
the virtual machine {(VM)} driven by timer-based method sampling, thread
scheduling, garbage collection, and various.},
+ abstract = {Java performance is far from being trivial to benchmark
because it is affected by various factors such as the Java application, its
input, the virtual machine, the garbage collector, the heap size, etc. In
addition, non-determinism at run-time causes the execution time of a Java
program to differ from run to run. There are a number of sources of
non-determinism such as Just-In-Time {(JIT)} compilation and optimization in
the virtual machine {(VM)} driven by timer-based method sampling, thread
scheduling, garbage collection, and various.},
number = {10},
journal = {{SIGPLAN} Notices},
author = {Georges, Andy and Buytaert, Dries and Eeckhout, Lieven},
@@ -228,12 +352,12 @@
pages = {1--12}
},
-@techreport{andreas_gal_incremental_2006,
+@techreport{gal_incremental_2006,
title = {Incremental Dynamic Code Generation with Trace Trees},
abstract = {The unit of compilation for traditional just-in-time
compilers is the method. We have explored trace-based compilation, in which the
unit of compilation is a loop, potentially spanning multiple methods and even
library code. Using a new intermediate representation that is discovered and
updated lazily on-demand while the program is being executed, our compiler
generates code that is competitive with traditional dynamic compilers, but that
uses only a fraction of the compile time and memory footprint.},
number = {{ICS-TR-06-16}},
institution = {Donald Bren School of Information and Computer Science,
University of California, Irvine},
- author = {Andreas Gal and Michael Franz},
+ author = {Gal, Andreas and Franz, Michael},
month = nov,
year = {2006},
pages = {11}
@@ -254,19 +378,19 @@
keywords = {dynamic compilation, embedded, software trace scheduling,
{SSA}, {VM}}
},
-@inproceedings{mario_wolczko_towards_1999,
- title = {Towards a Universal Implementation Substrate for
{Object-Oriented} Languages},
+@inproceedings{wolczko_towards_1999,
+ title = {Towards a Universal Implementation Substrate for
Object-Oriented Languages},
abstract = {Self is a minimalist object-oriented language with a
sophisticated implementation that utilizes adaptive optimization. We have built
implementations of Smalltalk and Java by translation to Self. These
implementations were much easier to construct in Self than by conventional
means, and perform surprisingly well (competitively with conventional,
commercial implementations). This leads us to believe that a Self-like system
may form the basis of a universal substrate for implementation of
object-oriented languages.},
booktitle = {{OOPSLA} workshop on Simplicity, Performance, and
Portability in Virtual Machine Design},
- author = {Mario Wolczko and Ole Agesen and David Ungar},
+ author = {Wolczko, Mario and Agesen, Ole and {{David} Ungar}},
year = {1999},
keywords = {fixme}
},
-@inproceedings{hoelzle_optimizing_1994,
+@inproceedings{holzle_optimizing_1994,
address = {Orlando, Florida, United States},
title = {Optimizing dynamically-dispatched calls with run-time type
feedback},
- isbn = {{0-89791-662-X}},
+ isbn = {0-89791-662-X},
url = {http://portal.acm.org/citation.cfm?id=178243.178478},
doi = {10.1145/178243.178478},
abstract = {Note: {OCR} errors may be found in this Reference List
extracted from the full text article. {ACM} has opted to expose the complete
List rather than only correct and linked references.},
@@ -306,17 +430,17 @@
doi = {10.1145/74878.74884},
abstract = {We have developed and implemented techniques that double
the performance of dynamically-typed object-oriented languages. Our {SELF}
implementation runs twice as fast as the fastest Smalltalk implementation,
despite {SELF's} lack of classes and explicit variables. To compensate for the
absence of classes, our system uses implementation-level maps to transparently
group objects cloned from the same prototype, providing data type information
and eliminating the apparent space overhead for prototype-based systems. To
compensate for dynamic typing, user-defined control structures, and the lack of
explicit variables, our system dynamically compiles multiple versions of a
source method, each customized according to its receiver's map. Within each
version the type of the receiver is fixed, and thus the compiler can statically
bind and inline all messages sent to self. Message splitting and type
prediction extract and preserve even more static type information, allowing the
comp
iler to inline many other messages. Inlining dramatically improves performance
and eliminates the need to hard-wire low-level methods such as +,==, and
{ifTrue:.} Despite inlining and other optimizations, our system still supports
interactive programming environments. The system traverses internal dependency
lists to invalidate all compiled methods affected by a programming change. The
debugger reconstructs inlined stack frames from compiler-generated debugging
information, making inlining invisible to the {SELF} programmer.},
booktitle = {{OOPSLA}},
- author = {Chambers, C. and Ungar, D. and E. Lee},
+ author = {Chambers, C. and Ungar, D. and {{E.} Lee}},
year = {1989},
keywords = {self, specialization}
},
-@inproceedings{hoelzle_optimizing_1991,
- title = {Optimizing {Dynamically-Typed} {Object-Oriented} Languages
With Polymorphic Inline Caches},
+@inproceedings{holzle_optimizing_1991,
+ title = {Optimizing Dynamically-Typed Object-Oriented Languages With
Polymorphic Inline Caches},
isbn = {3-540-54262-0},
url = {http://portal.acm.org/citation.cfm?id=679193&dl=ACM&coll=portal},
booktitle = {{ECOOP}},
- publisher = {{Springer-Verlag}},
+ publisher = {Springer-Verlag},
author = {Hölzle, Urs and Chambers, Craig and Ungar, David},
year = {1991}
},
@@ -346,23 +470,4 @@
publisher = {{ACM}},
author = {Sullivan, Gregory T. and Bruening, Derek L. and Baron, Iris
and Garnett, Timothy and Amarasinghe, Saman},
year = {2003}
-}
-
-@inproceedings{kotzmann_escape_2005,
- address = {New York, {NY}, {USA}},
- series = {{VEE} '05},
- title = {Escape analysis in the context of dynamic compilation and
deoptimization},
- isbn = {1-59593-047-7},
- location = {Chicago, {IL}, {USA}},
- doi = {10.1145/1064979.1064996},
- abstract = {In object-oriented programming languages, an object is said
to escape the method or thread in which it was created if it can also be
accessed by other methods or threads. Knowing which objects do not escape
allows a compiler to perform aggressive {optimizations.This} paper presents a
new intraprocedural and interprocedural algorithm for escape analysis in the
context of dynamic compilation where the compiler has to cope with dynamic
class loading and deoptimization. It was implemented for Sun Microsystems' Java
{HotSpot™} client compiler and operates on an intermediate representation
in {SSA} form. We introduce equi-escape sets for the efficient propagation of
escape information between related objects. The analysis is used for scalar
replacement of fields and synchronization removal, as well as for stack
allocation of objects and fixed-sized arrays. The results of the
interprocedural analysis support the compiler in inlining decisions and allow
actual parameters
to be allocated on the caller {stack.Under} certain circumstances, the Java
{HotSpot™} {VM} is forced to stop executing a method's machine code and
transfer control to the interpreter. This is called deoptimization. Since the
interpreter does not know about the scalar replacement and synchronization
removal performed by the compiler, the deoptimization framework was extended to
reallocate and relock objects on demand.},
- booktitle = {Proceedings of the 1st {ACM/USENIX} international
conference on Virtual execution environments},
- publisher = {{ACM}},
- author = {Kotzmann, Thomas and Mössenböck, Hanspeter},
- year = {2005},
- note = {{ACM} {ID:} 1064996},
- keywords = {algorithms, allocation/deallocation strategies,
deoptimization},
- pages = {111–120}
-},
-
-
+}
\ No newline at end of file
diff --git a/talk/dls2012/paper.tex b/talk/dls2012/paper.tex
--- a/talk/dls2012/paper.tex
+++ b/talk/dls2012/paper.tex
@@ -208,8 +208,8 @@
The work described in this paper was done in the context of the PyPy
project.\footnote{\texttt{http://pypy.org}} PyPy is a framework for
implementing
-dynamic languages efficiently~\cite{armin_rigo_pypys_2006}. When implementing a
-language with PyPy, one writes an interpreter for the language in
RPython~\cite{davide_ancona_rpython:_2007}.
+dynamic languages efficiently~\cite{rigo_pypys_2006}. When implementing a
+language with PyPy, one writes an interpreter for the language in
RPython~\cite{ancona_rpython:_2007}.
RPython (``Restricted Python``) is a subset
of Python chosen in such a way that it can be efficiently translated to a
C-based VM by performing type inference.
@@ -672,8 +672,8 @@
can be reused for all other appearances. RPython's optimizers can also remove
repeated heap reads if the intermediate operations cannot have changed their
value.\footnote{We perform a type-based alias analysis to know which
-writes can affect which reads~\cite{XXX}. In addition writes on newly
allocated objects
-can never change the value of old existing ones.}
+writes can affect which reads~\cite{diwan_type-based_1998}. In addition writes
+on newly allocated objects can never change the value of old existing ones.}
When that is combined with loop peeling, the single execution of the operation
is placed in the preamble. That is, loop invariant pure operations and heap
@@ -1091,7 +1091,7 @@
Other interesting interpreters that are helped greatly by this optimization are
for example our Prolog interpreter written in
-RPython~\cite{carl_friedrich_bolz_towards_2010}. Prolog programs often contain
+RPython~\cite{bolz_towards_2010}. Prolog programs often contain
tight
loops that perform list processing. Furthermore we experimented with a Python
library
for writing numerical kernels doing array manipulation. The exact extent is
@@ -1105,7 +1105,18 @@
optimizations described here achieve are not in any way new. However, we think
that achieving them in the way described in this paper is simpler than writing
explicit algorithms.
-\cfbolz{more explicit listing of prior work goes here}
+
+Loop invariant code motion has been part of early compilers in the 1960s and
+1970s~\cite{allen_catalogue_1971}. The approach for achieving loop invariant
+code motion is typically to perform partial redundancy elimination. The
+approach was first proposed by Morel and Renvoise~\cite{morel_global_1979}. It
+involves solving data flow problems usually involding bidirection data flow
+equations. After improvements~\cite{chow_portable_1984,
+dhamdhere_practical_1991} this approach was followed by the work of Knoop
+et.al.~\cite{knoop_lazy_1992} who cleany separated the problem into a backward
+and forward data flow analysis. Implementing partial redundancy elimination in
+compilers that use SSA form \cite{chow_new_1997} simplified the algorithms
+because no iterative data flow analysis is needed any more.
As described in the introduction,
Mike Pall pioneered the approach described in this paper.
@@ -1155,7 +1166,7 @@
The current approach still has some limitations which we plan to address in the
future. In particular loop peeling works poorly in combination with trace
-trees~\cite{andreas_gal_incremental_2006} or trace
stitching~\cite{gal_trace-based_2009}.
+trees~\cite{gal_incremental_2006} or trace
stitching~\cite{gal_trace-based_2009}.
The side exits attached guards that fail often
currently have to jump to the preamble which makes loops with several equally
common paths less efficient than they could be.
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit