arch/mips/cavium-octeon/octeon-memcpy.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

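/*
 * In C terms the two entry points behave roughly as follows (a sketch
 * with hypothetical prototypes; the real declarations and the
 * non-standard __copy_user calling convention live in uaccess.h):
 *
 *      void *memcpy(void *dst, const void *src, size_t len);
 *              returns dst (in v0)
 *      size_t __copy_user(void *dst, const void *src, size_t len);
 *              "returns" in a2/len the number of uncopied bytes,
 *              0 on success
 */
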
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

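/*
 * Worked example of the invariants above (numbers are illustrative):
 * entering __copy_user with src_entry = S and len = 64 means uaccess.h
 * has set AT = S + 64.  If a load faults after 40 bytes have been
 * copied, src = S + 40 still satisfies S <= src < AT, so the fixup
 * code can report AT - (faulting address) = 24 uncopied bytes without
 * any extra bookkeeping in the fast path.
 */
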
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous

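/*
 * For illustration, EXC(LOAD t0, UNIT(0)(src), l_exc) expands to:
 *
 * 9:      LOAD    t0, UNIT(0)(src)
 *         .section __ex_table,"a"
 *         PTR     9b, l_exc
 *         .previous
 *
 * i.e. the access is tagged in the kernel exception table so that a
 * fault on label 9 branches to the l_exc fixup code instead of oopsing.
 */
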
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

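/*
 * (Under the n64 ABI, regdef.h names $8-$11 a4-a7 and $12-$15 t0-t3;
 * the defines above restore the o32-style naming t0-t7 = $8-$15 that
 * the shared code expects.)
 */
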
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

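/*
 * A short worked example of the macros above (little-endian case, where
 * LDFIRST/LDREST are ldr/ldl): one misaligned NBYTES unit is fetched by
 * the pair
 *
 *      LDFIRST t0, FIRST(0)(src)       # ldr t0, 0(src)
 *      LDREST  t0, REST(0)(src)        # ldl t0, 7(src)
 *
 * which merges the two aligned doublewords that src straddles into t0.
 * FIRST(1) = 8 and REST(1) = 15 address the next unit, and
 * (src & ADDRMASK) != 0 is the misalignment test used below.
 */
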
        .text
        .set    noreorder
        .set    noat
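        #
        # With .set noreorder the assembler does not fill branch delay
        # slots; throughout this file the instruction after a branch is
        # its delay slot, written with one extra leading space.
        #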

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
EXPORT_SYMBOL(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
         and    t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
         sltu   t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
         sltu   t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
         sltu   t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
         sltu   t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
         sltu   t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
         pref   0, 128(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if there are more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-7)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-6)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-5)(src),      l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-8)(dst),      s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst),      s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-3)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-2)(src),      l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-1)(src),      l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst),      s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
         sltu   t0, len, 128            # See if we can loop once more
        beqz    t0, 1b
         nop
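        #
        # Each pass through the loop above copies 16*NBYTES = 128 bytes.
        # The two loop entries only differ in prefetching: stay at 2b
        # (prefetching 256 bytes ahead) while len > 256, then finish at
        # 1b, where a prefetch could otherwise run past the end of the
        # source buffer.
        #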
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 8*NBYTES
        bnez    t0, less_than_8units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
         ADD    dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
         nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b copy_bytes_checklen
EXC(     STORE  t0, -8(dst),            s_exc_p1u)
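        # The STORE above sits in the branch delay slot and still
        # executes; dst has already been advanced by NBYTES, hence the
        # -8 (== -NBYTES) offset.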

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES  */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy_rewind16:
        /* Rewind src and dst by 16*NBYTES for l_exc_copy */
        SUB     src, src, 16*NBYTES
        SUB     dst, dst, 16*NBYTES
l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len = number of uncopied bytes
        jr      ra
         nop
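/*
 * Note: THREAD_BUADDR holds the bad address recorded when the load
 * faulted.  Given invariant (1) from the top of this file (AT is one
 * past the end of the source), AT - THREAD_BUADDR is exactly the
 * number of source bytes that were never read, which is what
 * __copy_user must return in len.
 */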


#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES
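
/*
 * For example, SEXC(4) expands to:
 *
 * s_exc_p4u:
 *         jr      ra
 *          ADD    len, len, 4*NBYTES
 *
 * len was decremented for the whole block before the stores were
 * issued, so a store that faults with 4 units still unwritten must add
 * those 4*NBYTES back before returning.
 */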

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop

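/*
 * A C-level sketch of memmove's overlap test below (illustrative only):
 *
 *      if (src >= dst + len || dst >= src + len)
 *              return memcpy(dst, src, len);   # no overlap: plain copy
 *      return __rmemcpy(dst, src, len);        # overlap: copy byte-wise,
 *                                              # backwards if src < dst
 */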
        .align  5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0                      # dst + len <= src -> memcpy
        sltu    t1, a0, t1                      # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                          /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
         sltu   t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)
