From 8d2b1a44e65187e6aa8baf08322c276c69ce43e0 Mon Sep 17 00:00:00 2001
From: Tim Daly
Date: Wed, 6 Apr 2016 13:16:48 -0400
Subject: [PATCH] books/bookvolbib add Ahre15
Goal: Axiom Numerics
@techreport{Ahre15,
author = "Ahrens, Peter and Nguyen, Hong Diep and Demmel, James",
title = "Efficient Reproducible Floating Point Summation and BLAS",
institution = "University of California, Berkeley",
year = "2015",
month = "December",
type = "technical report",
number = "229",
paper = "Ahre15.pdf",
url = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-229.pdf",
abstract =
"We define reproducibility to mean getting bitwise identical results
from multiple runs of the same program, perhaps with different
hardware resources or other changes that should ideally not change the
answer. Many users depend on reproducibility for debugging or
correctness. However, dynamic scheduling of parallel computing
resources, combined with nonassociativity of floating point addition,
makes attaining reproducibility a challenge even for simple operations
like summing a vector of numbers, or more complicated operations like
Basic Linear Algebra Subprograms (BLAS). We describe an algorithm that
computes a reproducible sum of floating point numbers independent of
the order of summation. The algorithm depends only on a subset of the
IEEE Floating Point Standard 754-2008. It is communication-optimal, in
the sense that it does just one pass over the data in the sequential
case, or one reduction operation in the parallel case, requiring an
``accumulator'' represented by just 6 floating point words (more can
be used if higher precision is desired). The arithmetic cost with a
6-word accumulator is $7n$ floating point additions to sum $n$ words,
and (in IEEE double precision) the final error bound can be up to
$10^8$ times smaller than the error bound for conventional
summation. We describe the basic summation algorithm, the software
infrastructure used to build reproducible BLAS (ReproBLAS), and
performance results. For example, when computing the dot product of
4096 double precision floating point numbers, we get a $4x$ slowdown
compared to Intel Math Kernel Library (MKL) running on an Intel Core
i7-2600 CPU operating at 3.4 GHz and 256 KB L2 Cache."
}
---
books/bookvolbib.pamphlet | 44 ++++++++++++++++++++++++++++++++++++++++
changelog | 2 +
patch | 43 ++++++++++++++++++++++++++++++++++++--
src/axiom-website/patches.html | 2 +
4 files changed, 88 insertions(+), 3 deletions(-)
diff --git a/books/bookvolbib.pamphlet b/books/bookvolbib.pamphlet
index b982f3e..4e9ba6a 100644
--- a/books/bookvolbib.pamphlet
+++ b/books/bookvolbib.pamphlet
@@ -2102,6 +2102,50 @@ when shown in factored form.
\end{chunk}
+\index{Ahrens, Peter}
+\index{Nguyen, Hong Diep}
+\index{Demmel, James}
+\begin{chunk}{axiom.bib}
+@techreport{Ahre15,
+ author = "Ahrens, Peter and Nguyen, Hong Diep and Demmel, James",
+ title = "Efficient Reproducible Floating Point Summation and BLAS",
+ institution = "University of California, Berkeley",
+ year = "2015",
+ month = "December",
+ type = "technical report",
+ number = "229",
+ paper = "Ahre15.pdf",
+ url = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-229.pdf",
+ abstract =
+ "We define reproducibility to mean getting bitwise identical results
+ from multiple runs of the same program, perhaps with different
+ hardware resources or other changes that should ideally not change the
+ answer. Many users depend on reproducibility for debugging or
+ correctness. However, dynamic scheduling of parallel computing
+ resources, combined with nonassociativity of floating point addition,
+ makes attaining reproducibility a challenge even for simple operations
+ like summing a vector of numbers, or more complicated operations like
+ Basic Linear Algebra Subprograms (BLAS). We describe an algorithm that
+ computes a reproducible sum of floating point numbers independent of
+ the order of summation. The algorithm depends only on a subset of the
+ IEEE Floating Point Standard 754-2008. It is communication-optimal, in
+ the sense that it does just one pass over the data in the sequential
+ case, or one reduction operation in the parallel case, requiring an
+ ``accumulator'' represented by just 6 floating point words (more can
+ be used if higher precision is desired). The arithmetic cost with a
+ 6-word accumulator is $7n$ floating point additions to sum $n$ words,
+ and (in IEEE double precision) the final error bound can be up to
+ $10^8$ times smaller than the error bound for conventional
+ summation. We describe the basic summation algorithm, the software
+ infrastructure used to build reproducible BLAS (ReproBLAS), and
+ performance results. For example, when computing the dot product of
+ 4096 double precision floating point numbers, we get a $4x$ slowdown
+ compared to Intel Math Kernel Library (MKL) running on an Intel Core
+ i7-2600 CPU operating at 3.4 GHz and 256 KB L2 Cache."
+}
+
+\end{chunk}
+
\index{Demmel, James}
\index{Kahan, W.}
\begin{chunk}{axiom.bib}
diff --git a/changelog b/changelog
index 4bbf4bf..8108e74 100644
--- a/changelog
+++ b/changelog
@@ -1,3 +1,5 @@
+20160406 tpd src/axiom-website/patches.html 20160406.02.tpd.patch
+20160406 tpd books/bookvolbib add Ahre15
20160406 tpd src/axiom-website/patches.html 20160406.01.tpd.patch
20160406 tpd books/bookvol10.2 fix unit tests for MATCAT users
20160406 tpd books/bookvol10.3 fix unit tests for MATCAT users
diff --git a/patch b/patch
index 21d41b9..3934f73 100644
--- a/patch
+++ b/patch
@@ -1,5 +1,42 @@
-books/bookvol10.2 fix unit tests for MATCAT users
+books/bookvolbib add Ahre15
-Goal: Axiom Mathematics
+Goal: Axiom Numerics
+
+@techreport{Ahre15,
+ author = "Ahrens, Peter and Nguyen, Hong Diep and Demmel, James",
+ title = "Efficient Reproducible Floating Point Summation and BLAS",
+ institution = "University of California, Berkeley",
+ year = "2015",
+ month = "December",
+ type = "technical report",
+ number = "229",
+ paper = "Ahre15.pdf",
+ url = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-229.pdf",
+ abstract =
+ "We define reproducibility to mean getting bitwise identical results
+ from multiple runs of the same program, perhaps with different
+ hardware resources or other changes that should ideally not change the
+ answer. Many users depend on reproducibility for debugging or
+ correctness. However, dynamic scheduling of parallel computing
+ resources, combined with nonassociativity of floating point addition,
+ makes attaining reproducibility a challenge even for simple operations
+ like summing a vector of numbers, or more complicated operations like
+ Basic Linear Algebra Subprograms (BLAS). We describe an algorithm that
+ computes a reproducible sum of floating point numbers independent of
+ the order of summation. The algorithm depends only on a subset of the
+ IEEE Floating Point Standard 754-2008. It is communication-optimal, in
+ the sense that it does just one pass over the data in the sequential
+ case, or one reduction operation in the parallel case, requiring an
+ ``accumulator'' represented by just 6 floating point words (more can
+ be used if higher precision is desired). The arithmetic cost with a
+ 6-word accumulator is $7n$ floating point additions to sum $n$ words,
+ and (in IEEE double precision) the final error bound can be up to
+ $10^8$ times smaller than the error bound for conventional
+ summation. We describe the basic summation algorithm, the software
+ infrastructure used to build reproducible BLAS (ReproBLAS), and
+ performance results. For example, when computing the dot product of
+ 4096 double precision floating point numbers, we get a $4x$ slowdown
+ compared to Intel Math Kernel Library (MKL) running on an Intel Core
+ i7-2600 CPU operating at 3.4 GHz and 256 KB L2 Cache."
+}
-MATCAT, MATRIX, U8MAT, U16MAT, U32MAT were updated for zero?
diff --git a/src/axiom-website/patches.html b/src/axiom-website/patches.html
index bafec25..bbb47c3 100644
--- a/src/axiom-website/patches.html
+++ b/src/axiom-website/patches.html
@@ -5286,6 +5286,8 @@ src/axiom-website/documentation.html add blockquotes

books/bookvol10.5.pamphlet remove \uscore

20160406.01.tpd.patch
books/bookvol10.2 fix unit tests for MATCAT users

+20160406.02.tpd.patch
+books/bookvolbib add Ahre15

--
1.7.5.4