/* Source: ~ljy/RK356X_SDK_RELEASE.git, 2023-02-13, e440ec23c5a540cdd3f7464e8779219be6fd3d95 */
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>
 
#ifdef CONFIG_64BIT
/*
 * This code base is shared with the mips32 tree, which uses the o32 ABI
 * register definitions.  On 64-bit builds the temporaries are therefore
 * renamed from the n64 ABI register naming to the o32 ABI register naming:
 * t0-t3 are undefined and t0-t7 are then mapped onto $8-$15.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0    $8
#define t1    $9
#define t2    $10
#define t3    $11
#define t4    $12
#define t5    $13
#define t6    $14
#define t7    $15
 
/* 64-bit builds process one doubleword (8 bytes) per unit, see below. */
#define USE_DOUBLE
#endif
 
/*
 * Width-dependent helpers used by csum_partial (LOAD is redefined further
 * down for the copy routines):
 *   LOAD   - load one NBYTES-wide unit (ld or lw)
 *   LOAD32 - load one 32-bit word (lwu when USE_DOUBLE, lw otherwise)
 *   ADD    - unsigned add of unit width (daddu or addu)
 *   NBYTES - number of bytes in one unit (8 or 4)
 */
#ifdef USE_DOUBLE
 
#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8
 
#else
 
#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4
 
#endif /* USE_DOUBLE */
 
/* Byte offset of the unit-th NBYTES-wide unit. */
#define UNIT(unit)  ((unit)*NBYTES)
 
/*
 * ADDC(sum, reg): sum += reg with the carry added back in.
 *
 * After the add, sltu sets v1 to 1 when the result is (unsigned) smaller
 * than reg, i.e. when the addition wrapped around; that carry is then added
 * to sum.  The width of the addition follows ADD.
 *
 * Clobbers v1.  The assembler state is saved and restored around the
 * sequence (.set push/.set pop) and the instructions are assembled with
 * .set noat.
 */
#define ADDC(sum,reg)                        \
    .set    push;                        \
    .set    noat;                        \
    ADD    sum, reg;                    \
    sltu    v1, sum, reg;                    \
    ADD    sum, v1;                    \
    .set    pop
 
/*
 * ADDC32(sum, reg): same as ADDC, but always uses the 32-bit addu
 * regardless of USE_DOUBLE.  It is used to add the caller-supplied partial
 * checksum to the folded result.
 *
 * Clobbers v1; assembled with .set noat inside a .set push/.set pop pair.
 */
#define ADDC32(sum,reg)                        \
    .set    push;                        \
    .set    noat;                        \
    addu    sum, reg;                    \
    sltu    v1, sum, reg;                    \
    addu    sum, v1;                    \
    .set    pop
 
/*
 * CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
 *
 * Loads four consecutive NBYTES-wide units starting at offset(src) into
 * _t0.._t3 and accumulates them into sum.  The units are first added
 * pairwise (_t0 += _t1, _t2 += _t3) and the two partial results are then
 * added to sum; every addition goes through ADDC.
 *
 * Covers 4*NBYTES bytes.  Clobbers _t0 and _t2 (the others keep the loaded
 * values) and, through ADDC, v1.  src is not advanced.
 */
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
    LOAD    _t0, (offset + UNIT(0))(src);            \
    LOAD    _t1, (offset + UNIT(1))(src);            \
    LOAD    _t2, (offset + UNIT(2))(src);            \
    LOAD    _t3, (offset + UNIT(3))(src);            \
    ADDC(_t0, _t1);                        \
    ADDC(_t2, _t3);                        \
    ADDC(sum, _t0);                        \
    ADDC(sum, _t2)
 
/*
 * CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)
 *
 * Accumulates 32 bytes starting at offset(src) into sum.  With USE_DOUBLE
 * one CSUM_BIGCHUNK1 already covers 4 * 8 = 32 bytes; otherwise a single
 * CSUM_BIGCHUNK1 covers 4 * 4 = 16 bytes, so it is issued twice, the second
 * time at offset + 0x10.
 */
#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)    \
    CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)    \
    CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);    \
    CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
 
/*
 * csum_partial: checksum a memory area.
 *
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 *
 * The result is returned in v0: the sum over the area, folded to 32 bits
 * when USE_DOUBLE is set, byte-swapped within each halfword when the source
 * address was odd, and finally added to a2 with ADDC32.
 *
 * Register usage inside the routine:
 *   t7      - non-zero when the source address was odd on entry (only
 *             evaluated when the length is at least 8)
 *   t8      - scratch for alignment tests and loop counters
 *   t2      - carries the number of trailing bytes into .Lsmall_csumcpy
 *   t0-t4   - data temporaries
 *   v1      - clobbered by ADDC/ADDC32 and the fold/swap code
 *
 * The code runs with .set noreorder: the instruction following each branch
 * (indented by one extra space) is its delay slot and executes whether or
 * not the branch is taken.
 */
 
#define src a0
#define sum v0
 
    .text
    .set    noreorder
    .align    5
LEAF(csum_partial)
EXPORT_SYMBOL(csum_partial)
    move    sum, zero
    move    t7, zero
 
    /* Fewer than 8 bytes: skip all alignment work, t2 = whole length. */
    sltiu    t8, a1, 0x8
    bnez    t8, .Lsmall_csumcpy        /* < 8 bytes to copy */
     move    t2, a1
 
    andi    t7, src, 0x1            /* odd buffer? */
 
    /* Odd source address: consume one byte to reach halfword alignment. */
.Lhword_align:
    beqz    t7, .Lword_align
     andi    t8, src, 0x2
 
    lbu    t0, (src)
    LONG_SUBU    a1, a1, 0x1
#ifdef __MIPSEL__
    sll    t0, t0, 8
#endif
    ADDC(sum, t0)
    PTR_ADDU    src, src, 0x1
    andi    t8, src, 0x2            /* src changed, test bit 1 again */
 
    /* Bit 1 of src set: consume one halfword to reach word alignment. */
.Lword_align:
    beqz    t8, .Ldword_align
     sltiu    t8, a1, 56
 
    lhu    t0, (src)
    LONG_SUBU    a1, a1, 0x2
    ADDC(sum, t0)
    sltiu    t8, a1, 56            /* a1 changed, redo the test */
    PTR_ADDU    src, src, 0x2
 
    /*
     * t8 != 0 here means fewer than 56 bytes remain: go straight to the
     * word loop with t8 = a1 (set in the delay slot).
     */
.Ldword_align:
    bnez    t8, .Ldo_end_words
     move    t8, a1
 
    /* Bit 2 of src set: consume one 32-bit word. */
    andi    t8, src, 0x4
    beqz    t8, .Lqword_align
     andi    t8, src, 0x8
 
    LOAD32    t0, 0x00(src)
    LONG_SUBU    a1, a1, 0x4
    ADDC(sum, t0)
    PTR_ADDU    src, src, 0x4
    andi    t8, src, 0x8
 
    /* Bit 3 of src set: consume 8 bytes. */
.Lqword_align:
    beqz    t8, .Loword_align
     andi    t8, src, 0x10
 
#ifdef USE_DOUBLE
    ld    t0, 0x00(src)
    LONG_SUBU    a1, a1, 0x8
    ADDC(sum, t0)
#else
    lw    t0, 0x00(src)
    lw    t1, 0x04(src)
    LONG_SUBU    a1, a1, 0x8
    ADDC(sum, t0)
    ADDC(sum, t1)
#endif
    PTR_ADDU    src, src, 0x8
    andi    t8, src, 0x10
 
    /* Bit 4 of src set: consume 16 bytes.  t8 = a1 >> 7 afterwards. */
.Loword_align:
    beqz    t8, .Lbegin_movement
     LONG_SRL    t8, a1, 0x7
 
#ifdef USE_DOUBLE
    ld    t0, 0x00(src)
    ld    t1, 0x08(src)
    ADDC(sum, t0)
    ADDC(sum, t1)
#else
    CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
    LONG_SUBU    a1, a1, 0x10
    PTR_ADDU    src, src, 0x10
    LONG_SRL    t8, a1, 0x7
 
    /*
     * t8 = number of 128-byte blocks.  From here on a1 is no longer
     * decremented; the remaining pieces are selected by testing the bits
     * of a1 (0x40, 0x20, 0x1c, 0x3).
     */
.Lbegin_movement:
    beqz    t8, 1f
     andi    t2, a1, 0x40
 
    /* 4 x 32 bytes per iteration. */
.Lmove_128bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
    LONG_SUBU    t8, t8, 0x01
    .set    reorder                /* DADDI_WAR */
    PTR_ADDU    src, src, 0x80
    bnez    t8, .Lmove_128bytes
    .set    noreorder
 
    /* t2 = a1 & 0x40: one 64-byte piece. */
1:
    beqz    t2, 1f
     andi    t2, a1, 0x20
 
.Lmove_64bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
    PTR_ADDU    src, src, 0x40
 
    /* t2 = a1 & 0x20: one 32-byte piece.  t8 = a1 & 0x1c afterwards. */
1:
    beqz    t2, .Ldo_end_words
     andi    t8, a1, 0x1c
 
.Lmove_32bytes:
    CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
    andi    t8, a1, 0x1c
    PTR_ADDU    src, src, 0x20
 
    /*
     * t8 holds a byte count (either a1 & 0x1c or, when entered from
     * .Ldword_align, a1 itself).  If it is zero there are no whole words
     * left; otherwise it is converted to a word count.  t2 = a1 & 3 is the
     * number of trailing bytes for .Lsmall_csumcpy.
     */
.Ldo_end_words:
    beqz    t8, .Lsmall_csumcpy
     andi    t2, a1, 0x3
    LONG_SRL    t8, t8, 0x2
 
    /* One 32-bit word per iteration. */
.Lend_words:
    LOAD32    t0, (src)
    LONG_SUBU    t8, t8, 0x1
    ADDC(sum, t0)
    .set    reorder                /* DADDI_WAR */
    PTR_ADDU    src, src, 0x4
    bnez    t8, .Lend_words
    .set    noreorder
 
/* Unknown src alignment and < 8 bytes to go; t2 holds the byte count. */
.Lsmall_csumcpy:
    move    a1, t2
 
    /* Bit 2 of the count: one word.  t0 = bit 1 for the next test. */
    andi    t0, a1, 4
    beqz    t0, 1f
     andi    t0, a1, 2
 
    /* Still a full word to go  */
    ulw    t1, (src)
    PTR_ADDIU    src, 4
#ifdef USE_DOUBLE
    dsll    t1, t1, 32            /* clear lower 32bit */
#endif
    ADDC(sum, t1)
 
    /* Bit 1 of the count: one halfword.  t0 = bit 0 for the next test. */
1:    move    t1, zero
    beqz    t0, 1f
     andi    t0, a1, 1
 
    /* Still a halfword to go  */
    ulhu    t1, (src)
    PTR_ADDIU    src, 2
 
    /*
     * Bit 0 of the count: one byte.  The delay slot always moves the
     * halfword (or zero) into bits 31..16 of t1; the byte is then merged
     * into the low half, shifted left by 8 first on big-endian.
     */
1:    beqz    t0, 1f
     sll    t1, t1, 16
 
    lbu    t2, (src)
     nop
 
#ifdef __MIPSEB__
    sll    t2, t2, 8
#endif
    or    t1, t2
 
1:    ADDC(sum, t1)
 
    /* fold checksum */
#ifdef USE_DOUBLE
    /*
     * 64 -> 32 bits: add the low half (shifted into the upper half) to
     * sum, keep the upper 32 bits of the result and add the carry out of
     * the 64-bit addition.
     */
    dsll32    v1, sum, 0
    daddu    sum, v1
    sltu    v1, sum, v1
    dsra32    sum, sum, 0
    addu    sum, v1
#endif
 
    /* odd buffer alignment? */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
    /* v1 = sum with the bytes of each halfword swapped; used if t7 != 0. */
    .set    push
    .set    arch=mips32r2
    wsbh    v1, sum
    movn    sum, v1, t7
    .set    pop
#else
    /* Same swap done by hand with the mask 0x00ff00ff built in v1. */
    beqz    t7, 1f            /* odd buffer alignment? */
     lui    v1, 0x00ff
    addu    v1, 0x00ff
    and    t0, sum, v1
    sll    t0, t0, 8
    srl    sum, sum, 8
    and    sum, sum, v1
    or    sum, sum, t0
1:
#endif
    .set    reorder
    /* Add the passed partial csum.     */
    ADDC32(sum, a2)
    jr    ra
    .set    noreorder
    END(csum_partial)
 
 
/*
 * checksum and copy routines based on memcpy.S
 *
 *    csum_partial_copy_nocheck(src, dst, len, sum)
 *    __csum_partial_copy_kernel(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.    Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
 
/*
 * Register aliases for the copy-and-checksum routines.  src, dst, len and
 * psum are the first four arguments (a0-a3); sum is the return register v0.
 * odd (t8) records bit 0 of dst and errptr (t9) holds the pointer through
 * which -EFAULT is reported.
 */
#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9
 
/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *    not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */
 
/* Instruction type: the "type" argument of EXC() */
#define LD_INSN 1
#define ST_INSN 2
/* Build mode: the "mode" argument of __BUILD_CSUM_PARTIAL_COPY_USER */
#define LEGACY_MODE 1
#define EVA_MODE    2
/* Address space of each side: the "from"/"to" arguments of that macro */
#define USEROP   1
#define KERNELOP 2
 
/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 *
 * The expansion refers to the mode, from and to arguments of the enclosing
 * assembler macro, so EXC (and the LOAD/STORE wrappers built on it) can
 * only be used inside __BUILD_CSUM_PARTIAL_COPY_USER.
 *
 * - LEGACY_MODE: the plain instruction is emitted at local label 9 and a
 *   (9b, handler) pair is added to the __ex_table section.
 * - otherwise (EVA): a load with from == USEROP or a store with
 *   to == USEROP is emitted through __BUILD_EVA_INSN with an "e" appended
 *   to the mnemonic, again with an __ex_table entry; any other access is
 *   emitted as the plain instruction without an exception-table entry.
 */
#define EXC(insn, type, reg, addr, handler)    \
    .if \mode == LEGACY_MODE;        \
9:        insn reg, addr;            \
        .section __ex_table,"a";    \
        PTR    9b, handler;        \
        .previous;            \
    /* This is enabled in EVA mode */    \
    .else;                    \
        /* If loading from user or storing to user */    \
        .if ((\from == USEROP) && (type == LD_INSN)) || \
            ((\to == USEROP) && (type == ST_INSN));    \
9:            __BUILD_EVA_INSN(insn##e, reg, addr);    \
            .section __ex_table,"a";        \
            PTR    9b, handler;            \
            .previous;                \
        .else;                        \
            /* EVA without exception */        \
            insn reg, addr;                \
        .endif;                        \
    .endif
 
/*
 * From here on LOAD is no longer a bare mnemonic: it and its siblings
 * become function-like macros that wrap the access in EXC() so that a
 * handler can be named for every load and store.
 */
#undef LOAD
 
#ifdef USE_DOUBLE
 
/* Doubleword (8-byte) variants. */
#define LOADK    ld /* No exception */
#define LOAD(reg, addr, handler)    EXC(ld, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)    EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)    EXC(ldl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)    EXC(ldr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)    EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)    EXC(sdl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)    EXC(sdr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)    EXC(sd, ST_INSN, reg, addr, handler)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3
 
#else
 
/* Word (4-byte) variants. */
#define LOADK    lw /* No exception */
#define LOAD(reg, addr, handler)    EXC(lw, LD_INSN, reg, addr, handler)
#define LOADBU(reg, addr, handler)    EXC(lbu, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)    EXC(lwl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)    EXC(lwr, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)    EXC(sb, ST_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)    EXC(swl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)    EXC(swr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)    EXC(sw, ST_INSN, reg, addr, handler)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2
 
#endif /* USE_DOUBLE */
 
/*
 * Endian-dependent selection of the left/right partial accesses used for
 * unaligned units, and of the shift direction used to drop (and later
 * restore the position of) the bytes that do not belong to a partial unit.
 */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST    LOADL
#define STFIRST STORER
#define STREST    STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST    LOADR
#define STFIRST STOREL
#define STREST    STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
 
/* Byte offsets of the first and last byte of the unit-th unit. */
#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
 
/* Mask of the address bits that must be zero for a unit-aligned pointer. */
#define ADDRMASK (NBYTES-1)
 
/*
 * AT carries the end-of-source address for the load exception handler (see
 * the comment block above), so the assembler must not use it as its
 * temporary: either forbid assembler temporaries or redirect them to v1.
 */
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
    .set    noat
#else
    .set    at=v1
#endif
 
    /*
     * __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck
     *
     * Emits the body of one copy-and-checksum routine.
     *   mode      - LEGACY_MODE or EVA_MODE, consumed by EXC()
     *   from, to  - USEROP or KERNELOP for the load and the store side,
     *               consumed by EXC() when mode is not LEGACY_MODE
     *   __nocheck - when 1, the csum_partial_copy_nocheck entry point is
     *               emitted inside this expansion
     *
     * Registers: a0 = src, a1 = dst, a2 = len, a3 = psum, v0 = sum,
     * t8 = odd, t9 = errptr, AT = src + len as computed on entry.
     * t0-t7 are temporaries and v1 is clobbered by ADDC/ADDC32.
     *
     * Exits:
     *   .Ldone      - fold sum, swap bytes if odd, add psum, return
     *   .Ll_exc*    - faulting load: clear the rest of dst, store -EFAULT
     *                 through errptr, then leave through .Ldone
     *   .Ls_exc     - faulting store: return -1 in v0 and store -EFAULT
     *                 through errptr
     */
    .macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck
 
    PTR_ADDU    AT, src, len    /* See (1) above. */
    /*
     * Fetch the error pointer, the fifth argument: it is taken from a4 on
     * 64-bit builds and from the stack at 16(sp) otherwise.
     */
#ifdef CONFIG_64BIT
    move    errptr, a4
#else
    lw    errptr, 16(sp)
#endif
    /*
     * The csum_partial_copy_nocheck entry point sits after the AT and
     * errptr setup, so entering through it skips both.
     */
    .if \__nocheck == 1
    FEXPORT(csum_partial_copy_nocheck)
    EXPORT_SYMBOL(csum_partial_copy_nocheck)
    .endif
    move    sum, zero
    move    odd, zero
    /*
     * Note: dst & src may be unaligned, len may be 0
     * Temps
     */
    /*
     * The "issue break"s below are very approximate.
     * Issue delays for dcache fills will perturb the schedule, as will
     * load queue full replay traps, etc.
     *
     * If len < NBYTES use byte operations.
     */
    /* t0 = src misalignment, t1 = dst misalignment, t2 = (len < NBYTES) */
    sltu    t2, len, NBYTES
    and    t1, dst, ADDRMASK
    bnez    t2, .Lcopy_bytes_checklen\@
     and    t0, src, ADDRMASK
    andi    odd, dst, 0x1            /* odd buffer? */
    bnez    t1, .Ldst_unaligned\@
     nop
    bnez    t0, .Lsrc_unaligned_dst_aligned\@
    /*
     * use delay slot for fall-through
     * src and dst are aligned; need to compute rem
     */
.Lboth_aligned\@:
     SRL    t0, len, LOG_NBYTES+3     # +3 for 8 units/iter
    beqz    t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
     nop
    SUB    len, 8*NBYTES        # subtract here for bgez loop
    .align    4
    /*
     * Main loop: load 8 units, then store them while summing them
     * pairwise into sum.  src and len are updated between the loads and
     * the stores, dst after the stores.
     */
1:
    LOAD(t0, UNIT(0)(src), .Ll_exc\@)
    LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
    LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
    LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
    LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
    LOAD(t5, UNIT(5)(src), .Ll_exc_copy\@)
    LOAD(t6, UNIT(6)(src), .Ll_exc_copy\@)
    LOAD(t7, UNIT(7)(src), .Ll_exc_copy\@)
    SUB    len, len, 8*NBYTES
    ADD    src, src, 8*NBYTES
    STORE(t0, UNIT(0)(dst),    .Ls_exc\@)
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst),    .Ls_exc\@)
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst),    .Ls_exc\@)
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst),    .Ls_exc\@)
    ADDC(sum, t2)
    STORE(t4, UNIT(4)(dst),    .Ls_exc\@)
    ADDC(t4, t5)
    STORE(t5, UNIT(5)(dst),    .Ls_exc\@)
    ADDC(sum, t4)
    STORE(t6, UNIT(6)(dst),    .Ls_exc\@)
    ADDC(t6, t7)
    STORE(t7, UNIT(7)(dst),    .Ls_exc\@)
    ADDC(sum, t6)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, 8*NBYTES
    bgez    len, 1b
    .set    noreorder
    ADD    len, 8*NBYTES        # revert len (see above)
 
    /*
     * len == the number of bytes left to copy < 8*NBYTES
     */
.Lcleanup_both_aligned\@:
#define rem t7
    beqz    len, .Ldone\@
     sltu    t0, len, 4*NBYTES
    bnez    t0, .Lless_than_4units\@
     and    rem, len, (NBYTES-1)    # rem = len % NBYTES
    /*
     * len >= 4*NBYTES
     */
    LOAD(t0, UNIT(0)(src), .Ll_exc\@)
    LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
    LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
    LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
    SUB    len, len, 4*NBYTES
    ADD    src, src, 4*NBYTES
    STORE(t0, UNIT(0)(dst),    .Ls_exc\@)
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst),    .Ls_exc\@)
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst),    .Ls_exc\@)
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst),    .Ls_exc\@)
    ADDC(sum, t2)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, 4*NBYTES
    beqz    len, .Ldone\@
    .set    noreorder
.Lless_than_4units\@:
    /*
     * rem = len % NBYTES
     */
    beq    rem, len, .Lcopy_bytes\@
     nop
    /* Copy whole units one at a time until only rem bytes are left. */
1:
    LOAD(t0, 0(src), .Ll_exc\@)
    ADD    src, src, NBYTES
    SUB    len, len, NBYTES
    STORE(t0, 0(dst), .Ls_exc\@)
    ADDC(sum, t0)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, NBYTES
    bne    rem, len, 1b
    .set    noreorder
 
    /*
     * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
     * A loop would do only a byte at a time with possible branch
     * mispredicts.     Can't do an explicit LOAD dst,mask,or,STORE
     * because can't assume read-access to dst.  Instead, use
     * STREST dst, which doesn't require read access to dst.
     *
     * This code should perform better than a simple loop on modern,
     * wide-issue mips processors because the code has fewer branches and
     * more instruction-level parallelism.
     */
#define bits t2
    beqz    len, .Ldone\@
     ADD    t1, dst, len    # t1 is just past last byte of dst
    li    bits, 8*NBYTES
    SLL    rem, len, 3    # rem = number of bits to keep
    LOAD(t0, 0(src), .Ll_exc\@)
    SUB    bits, bits, rem # bits = number of bits to discard
    SHIFT_DISCARD t0, t0, bits
    STREST(t0, -1(t1), .Ls_exc\@)
    SHIFT_DISCARD_REVERT t0, t0, bits
    .set reorder
    ADDC(sum, t0)
    b    .Ldone\@
    .set noreorder
.Ldst_unaligned\@:
    /*
     * dst is unaligned
     * t0 = src & ADDRMASK
     * t1 = dst & ADDRMASK; t1 > 0
     * len >= NBYTES
     *
     * Copy enough bytes to align dst
     * Set match = (src and dst have same alignment)
     */
#define match rem
    LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
    ADD    t2, zero, NBYTES
    LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
    SUB    t2, t2, t1    # t2 = number of bytes copied
    xor    match, t0, t1
    STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
    SLL    t4, t1, 3        # t4 = number of bits to discard
    SHIFT_DISCARD t3, t3, t4
    /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
    ADDC(sum, t3)
    beq    len, t2, .Ldone\@
     SUB    len, len, t2
    ADD    dst, dst, t2
    /* match == 0: src became aligned together with dst. */
    beqz    match, .Lboth_aligned\@
     ADD    src, src, t2
 
    /* dst is aligned, src is not: 4 units per iteration. */
.Lsrc_unaligned_dst_aligned\@:
    SRL    t0, len, LOG_NBYTES+2     # +2 for 4 units/iter
    beqz    t0, .Lcleanup_src_unaligned\@
     and    rem, len, (4*NBYTES-1)     # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
    LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
    LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
    SUB    len, len, 4*NBYTES
    LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
    LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
    LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
    LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
    LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
    LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
    ADD    src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
    nop                # improves slotting
#endif
    STORE(t0, UNIT(0)(dst),    .Ls_exc\@)
    ADDC(t0, t1)
    STORE(t1, UNIT(1)(dst),    .Ls_exc\@)
    ADDC(sum, t0)
    STORE(t2, UNIT(2)(dst),    .Ls_exc\@)
    ADDC(t2, t3)
    STORE(t3, UNIT(3)(dst),    .Ls_exc\@)
    ADDC(sum, t2)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, 4*NBYTES
    bne    len, rem, 1b
    .set    noreorder
 
    /* Fewer than 4 units left: copy whole units, then the byte tail. */
.Lcleanup_src_unaligned\@:
    beqz    len, .Ldone\@
     and    rem, len, NBYTES-1  # rem = len % NBYTES
    beq    rem, len, .Lcopy_bytes\@
     nop
1:
    LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
    LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
    ADD    src, src, NBYTES
    SUB    len, len, NBYTES
    STORE(t0, 0(dst), .Ls_exc\@)
    ADDC(sum, t0)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, NBYTES
    bne    len, rem, 1b
    .set    noreorder
 
.Lcopy_bytes_checklen\@:
    beqz    len, .Ldone\@
     nop
.Lcopy_bytes\@:
    /* 0 < len < NBYTES  */
    /*
     * The bytes are copied one at a time and assembled into the partial
     * unit t2; t3 is the bit position of the next byte and moves by
     * SHIFT_INC per byte, starting at SHIFT_START.
     */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
    move    t2, zero    # partial word
    li    t3, SHIFT_START # shift
/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)            \
    LOADBU(t0, N(src), .Ll_exc_copy\@);    \
    SUB    len, len, 1;        \
    STOREB(t0, N(dst), .Ls_exc\@);    \
    SLLV    t0, t0, t3;        \
    addu    t3, SHIFT_INC;        \
    beqz    len, .Lcopy_bytes_done\@; \
     or    t2, t0
 
    COPY_BYTE(0)
    COPY_BYTE(1)
#ifdef USE_DOUBLE
    COPY_BYTE(2)
    COPY_BYTE(3)
    COPY_BYTE(4)
    COPY_BYTE(5)
#endif
    /* Last possible byte (index NBYTES-2): no length test is needed. */
    LOADBU(t0, NBYTES-2(src), .Ll_exc_copy\@)
    SUB    len, len, 1
    STOREB(t0, NBYTES-2(dst), .Ls_exc\@)
    SLLV    t0, t0, t3
    or    t2, t0
.Lcopy_bytes_done\@:
    ADDC(sum, t2)
.Ldone\@:
    /* fold checksum */
    .set    push
    .set    noat
#ifdef USE_DOUBLE
    /*
     * 64 -> 32 bits: add the low half (shifted into the upper half) to
     * sum, keep the upper 32 bits of the result and add the carry out of
     * the 64-bit addition.
     */
    dsll32    v1, sum, 0
    daddu    sum, v1
    sltu    v1, sum, v1
    dsra32    sum, sum, 0
    addu    sum, v1
#endif
 
    /* Swap the bytes of each halfword of sum when odd is non-zero. */
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
    .set    push
    .set    arch=mips32r2
    wsbh    v1, sum
    movn    sum, v1, odd
    .set    pop
#else
    beqz    odd, 1f            /* odd buffer alignment? */
     lui    v1, 0x00ff
    addu    v1, 0x00ff
    and    t0, sum, v1
    sll    t0, t0, 8
    srl    sum, sum, 8
    and    sum, sum, v1
    or    sum, sum, t0
1:
#endif
    .set    pop
    .set reorder
    /* Add the caller-supplied partial checksum and return. */
    ADDC32(sum, psum)
    jr    ra
    .set noreorder
 
.Ll_exc_copy\@:
    /*
     * Copy bytes from src until faulting load address (or until a
     * lb faults)
     *
     * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
     * may be more than a byte beyond the last address.
     * Hence, the lb below may get an exception.
     *
     * Assumes src < THREAD_BUADDR($28)
     */
    /* t0 = fault address, t2 = bit position of the next byte in sum */
    LOADK    t0, TI_TASK($28)
     li    t2, SHIFT_START
    LOADK    t0, THREAD_BUADDR(t0)
1:
    LOADBU(t1, 0(src), .Ll_exc\@)
    ADD    src, src, 1
    sb    t1, 0(dst)    # can't fault -- we're copy_from_user
    SLLV    t1, t1, t2
    addu    t2, SHIFT_INC
    ADDC(sum, t1)
    .set    reorder                /* DADDI_WAR */
    ADD    dst, dst, 1
    bne    src, t0, 1b
    .set    noreorder
.Ll_exc\@:
    LOADK    t0, TI_TASK($28)
     nop
    LOADK    t0, THREAD_BUADDR(t0)    # t0 is just past last good address
     nop
    SUB    len, AT, t0        # len number of uncopied bytes
    /*
     * Here's where we rely on src and dst being incremented in tandem,
     *   See (3) above.
     * dst += (fault addr - src) to put dst at first byte to clear
     */
    ADD    dst, t0            # compute start address in a1
    SUB    dst, src
    /*
     * Clear len bytes starting at dst.  Can't call __bzero because it
     * might modify len.  An inefficient loop for these rare times...
     */
    /* src is reused as the loop counter, starting at len - 1. */
    .set    reorder                /* DADDI_WAR */
    SUB    src, len, 1
    beqz    len, .Ldone\@
    .set    noreorder
1:    sb    zero, 0(dst)
    ADD    dst, dst, 1
    .set    push
    .set    noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
    bnez    src, 1b
     SUB    src, src, 1
#else
    li    v1, 1
    bnez    src, 1b
     SUB    src, src, v1
#endif
    /* Report the fault through errptr (delay slot) and finish the sum. */
    li    v1, -EFAULT
    b    .Ldone\@
     sw    v1, (errptr)
 
    /* Faulting store: give up, return -1 and report -EFAULT. */
.Ls_exc\@:
    li    v0, -1 /* invalid checksum */
    li    v1, -EFAULT
    jr    ra
     sw    v1, (errptr)
    .set    pop
    .endm
 
/*
 * Entry points.
 *
 * __csum_partial_copy_kernel is always built in LEGACY_MODE with
 * __nocheck == 1, so this expansion also contains the
 * csum_partial_copy_nocheck entry point.  Without CONFIG_EVA,
 * __csum_partial_copy_to_user and __csum_partial_copy_from_user are
 * additional names for the very same code.
 */
LEAF(__csum_partial_copy_kernel)
EXPORT_SYMBOL(__csum_partial_copy_kernel)
#ifndef CONFIG_EVA
FEXPORT(__csum_partial_copy_to_user)
EXPORT_SYMBOL(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
EXPORT_SYMBOL(__csum_partial_copy_from_user)
#endif
__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP 1
END(__csum_partial_copy_kernel)
 
/*
 * With CONFIG_EVA the user variants get their own expansions in EVA_MODE:
 * to_user loads from kernel and stores to user, from_user loads from user
 * and stores to kernel.  Neither emits csum_partial_copy_nocheck.
 */
#ifdef CONFIG_EVA
LEAF(__csum_partial_copy_to_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP 0
END(__csum_partial_copy_to_user)
 
LEAF(__csum_partial_copy_from_user)
__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP 0
END(__csum_partial_copy_from_user)
#endif