OPTIMIZATIONS

   1 #1
   2 (defun mysl (s)
   3     (declare (simple-string s))
   4     (declare (optimize (speed 3) (safety 0) (debug 0)))
   5     (let ((c 0))
   6       (declare (fixnum c))
   7       (dotimes (i (length s))
   8         (when (eql (aref s i) #\1)
   9           (incf c)))
  10       c))
  11
  12 * On X86 I is represented as a tagged integer.
  13
  14 * Unnecessary move:
  15   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  16   4: MOVE t23[EAX] => t24[EBX]
  17
  18 --------------------------------------------------------------------------------
  19 #2
  20 (defun quux (v)
  21   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  22   (declare (type (simple-array double-float 1) v))
  23   (let ((s 0d0))
  24     (declare (type double-float s))
  25     (dotimes (i (length v))
  26       (setq s (+ s (aref v i))))
  27     s))
  28
  29 * Python does not combine + with AREF, so generates extra move and
  30   allocates a register.
  31
  32 * On X86 Python thinks that all FP registers are directly accessible
  33   and emits costy MOVE ... => FR1.
  34
  35 --------------------------------------------------------------------------------
  36 #3
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 #4
  47 (defun bar (v1 v2)
  48   (declare (optimize (speed 3) (safety 0) (space 2))
  49            (type (simple-array base-char 1) v1 v2))
  50   (dotimes (i (length v1))
  51     (setf (aref v2 i) (aref v1 i))))
  52
  53 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  54                                   => t34[S2]<t35[AL]
  55         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  56         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  57         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  58         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  59
  60 * The value of DATA-VECTOR-SET is not used, so there is no need in the
  61   last two moves.
  62
  63 * And why two moves?
  64 --------------------------------------------------------------------------------
  65 #5
  66 (loop repeat 1.5)
  67
  68 uses generic arithmetic
  69 --------------------------------------------------------------------------------
  70 #6
  71 09:49:05 <jtra> I have found a case in those where suboptimal code is
  72   generate with nested loops, it might be moderately easy to fix that
  73 09:49:28 <jtra> see
  74   http://www.bagley.org/~doug/shootout/bench/nestedloop/nestedloop.cmucl
  75 09:50:30 <jtra> if you add declarations to dotimes, generated code is
  76   almost optimal, but most inner loops run out of registers and use
  77   memory location for iteration variable
  78
  79 ;;; -*- mode: lisp -*-
  80 ;;; http://www.bagley.org/~doug/shootout/
  81 ;;; from Friedrich Dominicus
  82
  83 (defun main ()
  84   (let ((n (parse-integer (or (car (last extensions:*command-line-strings*)) "1")))
  85         (x 0))
  86     (declare (fixnum n)
  87              (fixnum x)
  88              (optimize (speed 3) (debug 0) (safety 0)))
  89     (dotimes (a n)
  90       (dotimes (b n)
  91         (dotimes (c n)
  92           (dotimes (d n)
  93             (dotimes (e n)
  94               (dotimes (f n)
  95                 (incf x)))))))
  96    (format t "~A~%" x)))
  97 --------------------------------------------------------------------------------
  98 #7
  99 (defun foo (x)
 100   (declare (optimize speed (debug 0)))
 101   (if (< x 0) x (foo (1- x))))
 102
 103 SBCL generates a full call of FOO (but CMUCL does not).
 104 --------------------------------------------------------------------------------
 105 #8
 106 (defun foo (d)
 107   (declare (optimize (speed 3) (safety 0) (debug 0)))
 108   (declare (type (double-float 0d0 1d0) d))
 109   (loop for i fixnum from 1 to 5
 110      for x1 double-float = (sin d) ;;; !!!
 111      do (loop for j fixnum from 1 to 4
 112              sum x1 double-float)))
 113
 114 Without the marked declaration Python will use boxed representation for X1.
 115
 116 This is equivalent to
 117
 118 (let ((x nil))
 119   (setq x 0d0)
 120   ;; use of X as DOUBLE-FLOAT
 121 )
 122
 123 The initial binding is effectless, and without it X is of type
 124 DOUBLE-FLOAT. Unhopefully, IR1 does not optimize away effectless
 125 SETs/bindings, and IR2 does not perform type inference.
 126 --------------------------------------------------------------------------------
 127 #9 "Multi-path constant folding"
 128 (defun foo (x)
 129   (if (= (cond ((irgh x) 0)
 130                ((buh x) 1)
 131                (t 2))
 132          0)
 133       :yes
 134       :no))
 135
 136 This code could be optimized to
 137
 138 (defun foo (x)
 139   (cond ((irgh x) :yes)
 140         ((buh x) :no)
 141         (t :no)))
 142 --------------------------------------------------------------------------------
 143 #11
 144 (inverted variant of #9)
 145
 146 (lambda (x)
 147   (let ((y (sap-alien x c-string)))
 148     (list (alien-sap y)
 149           (alien-sap y))))
 150
 151 It could be optimized to
 152
 153 (lambda (x) (list x x))
 154
 155 (if Y were used only once, the current compiler would optimize it)
 156 --------------------------------------------------------------------------------
 157 #12
 158 (typep (truly-the (simple-array * (*)) x) 'simple-vector)
 159
 160 tests lowtag.
 161 --------------------------------------------------------------------------------
 162 #13
 163 FAST-+/FIXNUM and similar should accept unboxed arguments in interests
 164 of representation selection. Problem: inter-TN dependencies.
 165 --------------------------------------------------------------------------------