OPTIMIZATIONS

   1 #1
   2 (defun mysl (s)
   3     (declare (simple-string s))
   4     (declare (optimize (speed 3) (safety 0) (debug 0)))
   5     (let ((c 0))
   6       (declare (fixnum c))
   7       (dotimes (i (length s))
   8         (when (eql (aref s i) #\1)
   9           (incf c)))
  10       c))
  11
  12 * On X86 I is represented as a tagged integer.
  13
  14 * Unnecessary move:
  15   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  16   4: MOVE t23[EAX] => t24[EBX]
  17
  18 --------------------------------------------------------------------------------
  19 #2
  20 (defun quux (v)
  21   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  22   (declare (type (simple-array double-float 1) v))
  23   (let ((s 0d0))
  24     (declare (type double-float s))
  25     (dotimes (i (length v))
  26       (setq s (+ s (aref v i))))
  27     s))
  28
  29 * Python does not combine + with AREF, so it generates an extra move
  30   and allocates a register.
  31
  32 * On X86 Python thinks that all FP registers are directly accessible
  33   and emits costly MOVE ... => FR1.
  34
  35 --------------------------------------------------------------------------------
  36 #3
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 #4
  47 (defun bar (v1 v2)
  48   (declare (optimize (speed 3) (safety 0) (space 2))
  49            (type (simple-array base-char 1) v1 v2))
  50   (dotimes (i (length v1))
  51     (setf (aref v2 i) (aref v1 i))))
  52
  53 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  54                                   => t34[S2]<t35[AL]
  55         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  56         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  57         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  58         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  59
  60 * The value of DATA-VECTOR-SET is not used, so there is no need for the
  61   last two moves.
  62
  63 * And why two moves?
  64 --------------------------------------------------------------------------------
  65 #6
  66 09:49:05 <jtra> I have found a case in those where suboptimal code is
  67   generated with nested loops, it might be moderately easy to fix that
  68 09:49:28 <jtra> see
  69   http://www.bagley.org/~doug/shootout/bench/nestedloop/nestedloop.cmucl
  70 09:50:30 <jtra> if you add declarations to dotimes, generated code is
  71   almost optimal, but most inner loops run out of registers and use
  72   memory location for iteration variable
  73
  74 ;;; -*- mode: lisp -*-
  75 ;;; http://www.bagley.org/~doug/shootout/
  76 ;;; from Friedrich Dominicus
  77
  78 (defun main ()
  79   (let ((n (parse-integer (or (car (last extensions:*command-line-strings*)) "1")))
  80         (x 0))
  81     (declare (fixnum n)
  82              (fixnum x)
  83              (optimize (speed 3) (debug 0) (safety 0)))
  84     (dotimes (a n)
  85       (dotimes (b n)
  86         (dotimes (c n)
  87           (dotimes (d n)
  88             (dotimes (e n)
  89               (dotimes (f n)
  90                 (incf x)))))))
  91    (format t "~A~%" x)))
  92 --------------------------------------------------------------------------------
  93 #7
  94 (defun foo (x)
  95   (declare (optimize speed (debug 0)))
  96   (if (< x 0) x (foo (1- x))))
  97
  98 SBCL generates a full call of FOO (but CMUCL does not).
  99
 100 Partial explanation: CMUCL does generate a full (tail) call to FOO if
 101 *BLOCK-COMPILE* is NIL.  Maybe this is because in that case CMUCL doesn't
 102 generate a temporary(?) function in its IR1-TRANSLATOR for %DEFUN?
 103 --------------------------------------------------------------------------------
 104 #8
 105 (defun foo (d)
 106   (declare (optimize (speed 3) (safety 0) (debug 0)))
 107   (declare (type (double-float 0d0 1d0) d))
 108   (loop for i fixnum from 1 to 5
 109      for x1 double-float = (sin d) ;;; !!!
 110      do (loop for j fixnum from 1 to 4
 111              sum x1 double-float)))
 112
 113 Without the marked declaration Python will use a boxed representation for X1.
 114
 115 This is equivalent to
 116
 117 (let ((x nil))
 118   (setq x 0d0)
 119   ;; use of X as DOUBLE-FLOAT
 120 )
 121
 122 The initial binding has no effect, and without it X is of type
 123 DOUBLE-FLOAT. Unfortunately, IR1 does not optimize away SETs/bindings
 124 that have no effect, and IR2 does not perform type inference.
 125 --------------------------------------------------------------------------------
 126 #9 "Multi-path constant folding"
 127 (defun foo (x)
 128   (if (= (cond ((irgh x) 0)
 129                ((buh x) 1)
 130                (t 2))
 131          0)
 132       :yes
 133       :no))
 134
 135 This code could be optimized to
 136
 137 (defun foo (x)
 138   (cond ((irgh x) :yes)
 139         ((buh x) :no)
 140         (t :no)))
 141 --------------------------------------------------------------------------------
 142 #11
 143 (inverted variant of #9)
 144
 145 (lambda (x)
 146   (let ((y (sap-alien x c-string)))
 147     (list (alien-sap y)
 148           (alien-sap y))))
 149
 150 It could be optimized to
 151
 152 (lambda (x) (list x x))
 153
 154 (if Y were used only once, the current compiler would optimize it)
 155 --------------------------------------------------------------------------------
 156 #12
 157 (typep (truly-the (simple-array * (*)) x) 'simple-vector)
 158
 159 tests lowtag.
 160 --------------------------------------------------------------------------------
 161 #13
 162 FAST-+/FIXNUM and similar should accept unboxed arguments in the interests
 163 of representation selection. Problem: inter-TN dependencies.
 164 --------------------------------------------------------------------------------