OPTIMIZATIONS

   1 (defun mysl (s)
   2     (declare (simple-string s))
   3     (declare (optimize (speed 3) (safety 0) (debug 0)))
   4     (let ((c 0))
   5       (declare (fixnum c))
   6       (dotimes (i (length s))
   7         (when (eql (aref s i) #\1)
   8           (incf c)))
   9       c))
  10
  11 * On X86 I is represented as a tagged integer.
  12
  13 * EQL uses "CMP reg,reg" instead of "CMP reg,im". This causes
  14   allocation of an extra register and an extra move.
  15
  16 * Unnecessary move:
  17   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  18   4: MOVE t23[EAX] => t24[EBX]
  19
  20 --------------------------------------------------------------------------------
  21 (defun quux (v)
  22   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  23   (declare (type (simple-array double-float 1) v))
  24   (let ((s 0d0))
  25     (declare (type double-float s))
  26     (dotimes (i (length v))
  27       (setq s (+ s (aref v i))))
  28     s))
  29
  30 * Python does not combine + with AREF, so generates extra move and
  31   allocates a register.
  32
  33 * On X86 Python thinks that all FP registers are directly accessible
  34   and emits costy MOVE ... => FR1.
  35
  36 --------------------------------------------------------------------------------
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 (defun bar (v1 v2)
  47   (declare (optimize (speed 3) (safety 0) (space 2))
  48            (type (simple-array base-char 1) v1 v2))
  49   (dotimes (i (length v1))
  50     (setf (aref v2 i) (aref v1 i))))
  51
  52 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  53                                   => t34[S2]<t35[AL]
  54         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  55         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  56         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  57         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  58
  59 * The value of DATA-VECTOR-SET is not used, so there is no need in the
  60   last two moves.
  61
  62 * And why two moves?
  63 --------------------------------------------------------------------------------
  64 (loop repeat 1.5)
  65
  66 uses generic arithmetic
  67 --------------------------------------------------------------------------------
  68 09:49:05 <jtra> I have found a case in those where suboptimal code is
  69   generate with nested loops, it might be moderately easy to fix that
  70 09:49:28 <jtra> see
  71   http://www.bagley.org/~doug/shootout/bench/nestedloop/nestedloop.cmucl
  72 09:50:30 <jtra> if you add declarations to dotimes, generated code is
  73   almost optimal, but most inner loops run out of registers and use
  74   memory location for iteration variable
  75
  76 ;;; -*- mode: lisp -*-
  77 ;;; $Id$
  78 ;;; http://www.bagley.org/~doug/shootout/
  79 ;;; from Friedrich Dominicus
  80
  81 (defun main ()
  82   (let ((n (parse-integer (or (car (last extensions:*command-line-strings*)) "1")))
  83         (x 0))
  84     (declare (fixnum n)
  85              (fixnum x)
  86              (optimize (speed 3) (debug 0) (safety 0)))
  87     (dotimes (a n)
  88       (dotimes (b n)
  89         (dotimes (c n)
  90           (dotimes (d n)
  91             (dotimes (e n)
  92               (dotimes (f n)
  93                 (incf x)))))))
  94    (format t "~A~%" x)))
  95 --------------------------------------------------------------------------------
  96 (defun foo (x)
  97   (declare (optimize speed (debug 0)))
  98   (if (< x 0) x (foo (1- x))))
  99
 100 SBCL generates a full call of FOO (but CMUCL does not).
 101 --------------------------------------------------------------------------------
 102 (defun foo (d)
 103   (declare (optimize (speed 3) (safety 0) (debug 0)))
 104   (declare (type (double-float 0d0 1d0) d))
 105   (loop for i fixnum from 1 to 5
 106      for x1 double-float = (sin d) ;;; !!!
 107      do (loop for j fixnum from 1 to 4
 108              sum x1 double-float)))
 109
 110 Without the marked declaration Python will use boxed representation for X1.
 111
 112 This is equivalent to
 113
 114 (let ((x nil))
 115   (setq x 0d0)
 116   ;; use of X as DOUBLE-FLOAT
 117 )
 118
 119 The initial binding is effectless, and without it X is of type
 120 DOUBLE-FLOAT. Unhopefully, IR1 does not optimize away effectless
 121 SETs/bindings, and IR2 does not perform type inference.
 122 --------------------------------------------------------------------------------
 123 (defun foo (x)
 124   (if (= (cond ((irgh x) 0)
 125                ((buh x) 1)
 126                (t 2))
 127          0)
 128       :yes
 129       :no))
 130
 131 This code could be optimized to
 132
 133 (defun foo (x)
 134   (cond ((irgh x) :yes)
 135         ((buh x) :no)
 136         (t :no)))
 137 --------------------------------------------------------------------------------