LoongArch: mm: Refactor TLB exception handlers
author Rui Wang <wangrui@loongson.cn>
Wed, 12 Oct 2022 08:36:14 +0000 (16:36 +0800)
committer Huacai Chen <chenhuacai@loongson.cn>
Wed, 12 Oct 2022 08:36:14 +0000 (16:36 +0800)
This patch simplifies the TLB load, store and modify exception handlers:

1. Reduce instruction count, i.e. ALU/CSR operations and memory accesses (illustrated in the sketch below);
2. Execute the TLB search instruction (tlbsrch) only in the fast path;
3. Return directly from the fast path for both normal and huge pages;
4. Re-tab the assembly for better vertical alignment.
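
For illustration, a minimal C model of the two instructions that now perform each page-table index step (bstrpick.d + alsl.d, replacing the old srli.d/andi/slli.d/add.d sequence); this is a sketch with made-up function names, not kernel code:

  #include <stdint.h>

  /* bstrpick.d rd, rj, msb, lsb: rd = bits [msb:lsb] of rj, zero-extended */
  static inline uint64_t bstrpick_d(uint64_t rj, int msb, int lsb)
  {
        return (rj >> lsb) & ((1ULL << (msb - lsb + 1)) - 1);
  }

  /* alsl.d rd, rj, rk, sh: rd = (rj << sh) + rk */
  static inline uint64_t alsl_d(uint64_t rj, uint64_t rk, int sh)
  {
        return (rj << sh) + rk;
  }

  /* One page-table level: pick the index field out of the faulting
   * address, then scale it by the 8-byte entry size and add the table
   * base; two instructions instead of four.
   */
  uint64_t pt_entry_addr(uint64_t badv, uint64_t table, int shift, int bits)
  {
        return alsl_d(bstrpick_d(badv, shift + bits - 1, shift), table, 3);
  }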

It also fixes a concurrent modification issue in the fast path for huge pages.

The issue can occur through the following sequence of steps:

   CPU-1 (In TLB exception)         CPU-2 (In THP splitting)
1: Load PMD entry (HUGE=1)
2: Goto huge path
3:                                  Store PMD entry (HUGE=0)
4: Reload PMD entry (HUGE=0)
5: Fill TLB entry (PA is incorrect)
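
In C-like terms, the fix is to load the PMD entry exactly once and reuse that value on the huge path instead of reloading it; a rough sketch (the helpers and the bit value are illustrative stand-ins, not kernel API):

  #include <stdint.h>

  #define _PAGE_HUGE (1ULL << 6)        /* bit position illustrative */

  /* Hypothetical stand-ins for the huge/normal assembly paths. */
  void fill_huge_tlb(uint64_t pmde);
  void walk_pte_level(uint64_t pmde);

  void tlb_fast_path(volatile uint64_t *pmdp)
  {
        uint64_t pmde = *pmdp;          /* single load (ll.d on SMP) */

        if (pmde & _PAGE_HUGE)
                fill_huge_tlb(pmde);    /* reuse pmde; the old code
                                           reloaded *pmdp here */
        else
                walk_pte_level(pmde);
  }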

This patch also slightly improves the TLB processing performance:

* Normal pages: 2.15%, huge pages: 1.70% (measured with the benchmark below).

  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/mman.h>

  int main(int argc, char *argv[])
  {
        size_t page_size;
        size_t mem_size;
        size_t off;
        void *base;
        int flags;
        int i;

        if (argc < 2) {
                fprintf(stderr, "Usage: %s MEM_SIZE [HUGE]\n", argv[0]);
                return -1;
        }

        page_size = sysconf(_SC_PAGESIZE);
        flags = MAP_PRIVATE | MAP_ANONYMOUS;
        mem_size = strtoul(argv[1], NULL, 10);
        if (argc > 2)
                flags |= MAP_HUGETLB;

        /* Map, fault in and unmap the region ten times */
        for (i = 0; i < 10; i++) {
                base = mmap(NULL, mem_size, PROT_READ, flags, -1, 0);
                if (base == MAP_FAILED) {
                        fprintf(stderr, "Map memory failed!\n");
                        return -1;
                }

                /* Touch one word per page to fault the mapping in */
                for (off = 0; off < mem_size; off += page_size)
                        *(volatile int *)((char *)base + off);

                munmap(base, mem_size);
        }

        return 0;
  }
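
To try the benchmark, build it with something like "gcc -O2 bench.c -o bench" (the file name is only an example) and run "./bench 268435456" for the normal page case, or pass any second argument to enable MAP_HUGETLB; the huge page case requires hugetlb pages to be reserved beforehand (e.g. via /proc/sys/vm/nr_hugepages).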

Signed-off-by: Rui Wang <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
arch/loongarch/mm/tlbex.S

index 39743337999e98f5458fe08a42d72e120ffc7d1b..d8ee8fbc8c67321b92adbeabe9eea923aa2780ff 100644
 #include <asm/regdef.h>
 #include <asm/stackframe.h>
 
+#define PTRS_PER_PGD_BITS      (PAGE_SHIFT - 3)
+#define PTRS_PER_PUD_BITS      (PAGE_SHIFT - 3)
+#define PTRS_PER_PMD_BITS      (PAGE_SHIFT - 3)
+#define PTRS_PER_PTE_BITS      (PAGE_SHIFT - 3)
+
        .macro tlb_do_page_fault, write
        SYM_FUNC_START(tlb_do_page_fault_\write)
        SAVE_ALL
-       csrrd   a2, LOONGARCH_CSR_BADV
-       move    a0, sp
-       REG_S   a2, sp, PT_BVADDR
-       li.w    a1, \write
-       la.abs  t0, do_page_fault
-       jirl    ra, t0, 0
+       csrrd           a2, LOONGARCH_CSR_BADV
+       move            a0, sp
+       REG_S           a2, sp, PT_BVADDR
+       li.w            a1, \write
+       la.abs          t0, do_page_fault
+       jirl            ra, t0, 0
        RESTORE_ALL_AND_RET
        SYM_FUNC_END(tlb_do_page_fault_\write)
        .endm
 SYM_FUNC_START(handle_tlb_protect)
        BACKUP_T0T1
        SAVE_ALL
-       move    a0, sp
-       move    a1, zero
-       csrrd   a2, LOONGARCH_CSR_BADV
-       REG_S   a2, sp, PT_BVADDR
-       la.abs  t0, do_page_fault
-       jirl    ra, t0, 0
+       move            a0, sp
+       move            a1, zero
+       csrrd           a2, LOONGARCH_CSR_BADV
+       REG_S           a2, sp, PT_BVADDR
+       la.abs          t0, do_page_fault
+       jirl            ra, t0, 0
        RESTORE_ALL_AND_RET
 SYM_FUNC_END(handle_tlb_protect)
 
 SYM_FUNC_START(handle_tlb_load)
-       csrwr   t0, EXCEPTION_KS0
-       csrwr   t1, EXCEPTION_KS1
-       csrwr   ra, EXCEPTION_KS2
+       csrwr           t0, EXCEPTION_KS0
+       csrwr           t1, EXCEPTION_KS1
+       csrwr           ra, EXCEPTION_KS2
 
        /*
         * The vmalloc handling is not in the hotpath.
         */
-       csrrd   t0, LOONGARCH_CSR_BADV
-       bltz    t0, vmalloc_load
-       csrrd   t1, LOONGARCH_CSR_PGDL
+       csrrd           t0, LOONGARCH_CSR_BADV
+       bltz            t0, vmalloc_load
+       csrrd           t1, LOONGARCH_CSR_PGDL
 
 vmalloc_done_load:
        /* Get PGD offset in bytes */
-       srli.d  t0, t0, PGDIR_SHIFT
-       andi    t0, t0, (PTRS_PER_PGD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       bstrpick.d      ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+       alsl.d          t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PUD_SHIFT
-       andi    t0, t0, (PTRS_PER_PUD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PMD_SHIFT
-       andi    t0, t0, (PTRS_PER_PMD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
-       ld.d    ra, t1, 0
+       ld.d            ra, t1, 0
 
        /*
         * For huge tlb entries, pmde doesn't contain an address but
         * instead contains the tlb pte. Check the PAGE_HUGE bit and
         * see if we need to jump to huge tlb processing.
         */
-       andi    t0, ra, _PAGE_HUGE
-       bnez    t0, tlb_huge_update_load
+       rotri.d         ra, ra, _PAGE_HUGE_SHIFT + 1
+       bltz            ra, tlb_huge_update_load
 
-       csrrd   t0, LOONGARCH_CSR_BADV
-       srli.d  t0, t0, PAGE_SHIFT
-       andi    t0, t0, (PTRS_PER_PTE - 1)
-       slli.d  t0, t0, _PTE_T_LOG2
-       add.d   t1, ra, t0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       bstrpick.d      t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+       alsl.d          t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_load:
-#endif
-#ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
+       ll.d            t0, t1, 0
 #else
-       ld.d    t0, t1, 0
+       ld.d            t0, t1, 0
 #endif
-       tlbsrch
-
-       srli.d  ra, t0, _PAGE_PRESENT_SHIFT
-       andi    ra, ra, 1
-       beqz    ra, nopage_tlb_load
+       andi            ra, t0, _PAGE_PRESENT
+       beqz            ra, nopage_tlb_load
 
-       ori     t0, t0, _PAGE_VALID
+       ori             t0, t0, _PAGE_VALID
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, smp_pgtable_change_load
+       sc.d            t0, t1, 0
+       beqz            t0, smp_pgtable_change_load
 #else
-       st.d    t0, t1, 0
+       st.d            t0, t1, 0
 #endif
-       ori     t1, t1, 8
-       xori    t1, t1, 8
-       ld.d    t0, t1, 0
-       ld.d    t1, t1, 8
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       csrwr   t1, LOONGARCH_CSR_TLBELO1
+       tlbsrch
+       bstrins.d       t1, zero, 3, 3
+       ld.d            t0, t1, 0
+       ld.d            t1, t1, 8
+       csrwr           t0, LOONGARCH_CSR_TLBELO0
+       csrwr           t1, LOONGARCH_CSR_TLBELO1
        tlbwr
-leave_load:
-       csrrd   t0, EXCEPTION_KS0
-       csrrd   t1, EXCEPTION_KS1
-       csrrd   ra, EXCEPTION_KS2
+
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
        ertn
+
 #ifdef CONFIG_64BIT
 vmalloc_load:
-       la.abs  t1, swapper_pg_dir
-       b       vmalloc_done_load
+       la.abs          t1, swapper_pg_dir
+       b               vmalloc_done_load
 #endif
 
-       /*
-        * This is the entry point when build_tlbchange_handler_head
-        * spots a huge page.
-        */
+       /* This is the entry point of a huge page. */
 tlb_huge_update_load:
 #ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
-#else
-       ld.d    t0, t1, 0
+       ll.d            ra, t1, 0
 #endif
-       srli.d  ra, t0, _PAGE_PRESENT_SHIFT
-       andi    ra, ra, 1
-       beqz    ra, nopage_tlb_load
-       tlbsrch
+       andi            t0, ra, _PAGE_PRESENT
+       beqz            t0, nopage_tlb_load
 
-       ori     t0, t0, _PAGE_VALID
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, tlb_huge_update_load
-       ld.d    t0, t1, 0
+       ori             t0, ra, _PAGE_VALID
+       sc.d            t0, t1, 0
+       beqz            t0, tlb_huge_update_load
+       ori             t0, ra, _PAGE_VALID
 #else
-       st.d    t0, t1, 0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       ori             t0, ra, _PAGE_VALID
+       st.d            t0, t1, 0
 #endif
+       tlbsrch
        addu16i.d       t1, zero, -(CSR_TLBIDX_EHINV >> 16)
        addi.d          ra, t1, 0
        csrxchg         ra, t1, LOONGARCH_CSR_TLBIDX
        tlbwr
 
-       csrxchg zero, t1, LOONGARCH_CSR_TLBIDX
+       csrxchg         zero, t1, LOONGARCH_CSR_TLBIDX
 
        /*
         * A huge PTE describes an area the size of the
@@ -167,21 +154,20 @@ tlb_huge_update_load:
         * address space.
         */
        /* Huge page: Move Global bit */
-       xori    t0, t0, _PAGE_HUGE
-       lu12i.w t1, _PAGE_HGLOBAL >> 12
-       and     t1, t0, t1
-       srli.d  t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
-       or      t0, t0, t1
+       xori            t0, t0, _PAGE_HUGE
+       lu12i.w         t1, _PAGE_HGLOBAL >> 12
+       and             t1, t0, t1
+       srli.d          t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
+       or              t0, t0, t1
 
-       addi.d  ra, t0, 0
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       addi.d  t0, ra, 0
+       move            ra, t0
+       csrwr           ra, LOONGARCH_CSR_TLBELO0
 
        /* Convert to entrylo1 */
-       addi.d  t1, zero, 1
-       slli.d  t1, t1, (HPAGE_SHIFT - 1)
-       add.d   t0, t0, t1
-       csrwr   t0, LOONGARCH_CSR_TLBELO1
+       addi.d          t1, zero, 1
+       slli.d          t1, t1, (HPAGE_SHIFT - 1)
+       add.d           t0, t0, t1
+       csrwr           t0, LOONGARCH_CSR_TLBELO1
 
        /* Set huge page tlb entry size */
        addu16i.d       t0, zero, (CSR_TLBIDX_PS >> 16)
@@ -194,136 +180,120 @@ tlb_huge_update_load:
        addu16i.d       t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
        csrxchg         t1, t0, LOONGARCH_CSR_TLBIDX
 
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
+       ertn
+
 nopage_tlb_load:
-       dbar    0
-       csrrd   ra, EXCEPTION_KS2
-       la.abs  t0, tlb_do_page_fault_0
-       jr      t0
+       dbar            0
+       csrrd           ra, EXCEPTION_KS2
+       la.abs          t0, tlb_do_page_fault_0
+       jr              t0
 SYM_FUNC_END(handle_tlb_load)
 
 SYM_FUNC_START(handle_tlb_store)
-       csrwr   t0, EXCEPTION_KS0
-       csrwr   t1, EXCEPTION_KS1
-       csrwr   ra, EXCEPTION_KS2
+       csrwr           t0, EXCEPTION_KS0
+       csrwr           t1, EXCEPTION_KS1
+       csrwr           ra, EXCEPTION_KS2
 
        /*
         * The vmalloc handling is not in the hotpath.
         */
-       csrrd   t0, LOONGARCH_CSR_BADV
-       bltz    t0, vmalloc_store
-       csrrd   t1, LOONGARCH_CSR_PGDL
+       csrrd           t0, LOONGARCH_CSR_BADV
+       bltz            t0, vmalloc_store
+       csrrd           t1, LOONGARCH_CSR_PGDL
 
 vmalloc_done_store:
        /* Get PGD offset in bytes */
-       srli.d  t0, t0, PGDIR_SHIFT
-       andi    t0, t0, (PTRS_PER_PGD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
-
+       bstrpick.d      ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+       alsl.d          t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PUD_SHIFT
-       andi    t0, t0, (PTRS_PER_PUD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PMD_SHIFT
-       andi    t0, t0, (PTRS_PER_PMD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
-       ld.d    ra, t1, 0
+       ld.d            ra, t1, 0
 
        /*
         * For huge tlb entries, pmde doesn't contain an address but
         * instead contains the tlb pte. Check the PAGE_HUGE bit and
         * see if we need to jump to huge tlb processing.
         */
-       andi    t0, ra, _PAGE_HUGE
-       bnez    t0, tlb_huge_update_store
+       rotri.d         ra, ra, _PAGE_HUGE_SHIFT + 1
+       bltz            ra, tlb_huge_update_store
 
-       csrrd   t0, LOONGARCH_CSR_BADV
-       srli.d  t0, t0, PAGE_SHIFT
-       andi    t0, t0, (PTRS_PER_PTE - 1)
-       slli.d  t0, t0, _PTE_T_LOG2
-       add.d   t1, ra, t0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       bstrpick.d      t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+       alsl.d          t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_store:
-#endif
-#ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
+       ll.d            t0, t1, 0
 #else
-       ld.d    t0, t1, 0
+       ld.d            t0, t1, 0
 #endif
-       tlbsrch
-
-       srli.d  ra, t0, _PAGE_PRESENT_SHIFT
-       andi    ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-       xori    ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-       bnez    ra, nopage_tlb_store
+       andi            ra, t0, _PAGE_PRESENT | _PAGE_WRITE
+       xori            ra, ra, _PAGE_PRESENT | _PAGE_WRITE
+       bnez            ra, nopage_tlb_store
 
-       ori     t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       ori             t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, smp_pgtable_change_store
+       sc.d            t0, t1, 0
+       beqz            t0, smp_pgtable_change_store
 #else
-       st.d    t0, t1, 0
+       st.d            t0, t1, 0
 #endif
-
-       ori     t1, t1, 8
-       xori    t1, t1, 8
-       ld.d    t0, t1, 0
-       ld.d    t1, t1, 8
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       csrwr   t1, LOONGARCH_CSR_TLBELO1
+       tlbsrch
+       bstrins.d       t1, zero, 3, 3
+       ld.d            t0, t1, 0
+       ld.d            t1, t1, 8
+       csrwr           t0, LOONGARCH_CSR_TLBELO0
+       csrwr           t1, LOONGARCH_CSR_TLBELO1
        tlbwr
-leave_store:
-       csrrd   t0, EXCEPTION_KS0
-       csrrd   t1, EXCEPTION_KS1
-       csrrd   ra, EXCEPTION_KS2
+
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
        ertn
+
 #ifdef CONFIG_64BIT
 vmalloc_store:
-       la.abs  t1, swapper_pg_dir
-       b       vmalloc_done_store
+       la.abs          t1, swapper_pg_dir
+       b               vmalloc_done_store
 #endif
 
-       /*
-        * This is the entry point when build_tlbchange_handler_head
-        * spots a huge page.
-        */
+       /* This is the entry point of a huge page. */
 tlb_huge_update_store:
 #ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
-#else
-       ld.d    t0, t1, 0
+       ll.d            ra, t1, 0
 #endif
-       srli.d  ra, t0, _PAGE_PRESENT_SHIFT
-       andi    ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-       xori    ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
-       bnez    ra, nopage_tlb_store
-
-       tlbsrch
-       ori     t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       andi            t0, ra, _PAGE_PRESENT | _PAGE_WRITE
+       xori            t0, t0, _PAGE_PRESENT | _PAGE_WRITE
+       bnez            t0, nopage_tlb_store
 
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, tlb_huge_update_store
-       ld.d    t0, t1, 0
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       sc.d            t0, t1, 0
+       beqz            t0, tlb_huge_update_store
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #else
-       st.d    t0, t1, 0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       st.d            t0, t1, 0
 #endif
+       tlbsrch
        addu16i.d       t1, zero, -(CSR_TLBIDX_EHINV >> 16)
        addi.d          ra, t1, 0
        csrxchg         ra, t1, LOONGARCH_CSR_TLBIDX
        tlbwr
 
-       csrxchg zero, t1, LOONGARCH_CSR_TLBIDX
+       csrxchg         zero, t1, LOONGARCH_CSR_TLBIDX
        /*
         * A huge PTE describes an area the size of the
         * configured huge page size. This is twice the
@@ -334,21 +304,20 @@ tlb_huge_update_store:
         * address space.
         */
        /* Huge page: Move Global bit */
-       xori    t0, t0, _PAGE_HUGE
-       lu12i.w t1, _PAGE_HGLOBAL >> 12
-       and     t1, t0, t1
-       srli.d  t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
-       or      t0, t0, t1
+       xori            t0, t0, _PAGE_HUGE
+       lu12i.w         t1, _PAGE_HGLOBAL >> 12
+       and             t1, t0, t1
+       srli.d          t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
+       or              t0, t0, t1
 
-       addi.d  ra, t0, 0
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       addi.d  t0, ra, 0
+       move            ra, t0
+       csrwr           ra, LOONGARCH_CSR_TLBELO0
 
        /* Convert to entrylo1 */
-       addi.d  t1, zero, 1
-       slli.d  t1, t1, (HPAGE_SHIFT - 1)
-       add.d   t0, t0, t1
-       csrwr   t0, LOONGARCH_CSR_TLBELO1
+       addi.d          t1, zero, 1
+       slli.d          t1, t1, (HPAGE_SHIFT - 1)
+       add.d           t0, t0, t1
+       csrwr           t0, LOONGARCH_CSR_TLBELO1
 
        /* Set huge page tlb entry size */
        addu16i.d       t0, zero, (CSR_TLBIDX_PS >> 16)
@@ -362,126 +331,110 @@ tlb_huge_update_store:
        addu16i.d       t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
        csrxchg         t1, t0, LOONGARCH_CSR_TLBIDX
 
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
+       ertn
+
 nopage_tlb_store:
-       dbar    0
-       csrrd   ra, EXCEPTION_KS2
-       la.abs  t0, tlb_do_page_fault_1
-       jr      t0
+       dbar            0
+       csrrd           ra, EXCEPTION_KS2
+       la.abs          t0, tlb_do_page_fault_1
+       jr              t0
 SYM_FUNC_END(handle_tlb_store)
 
 SYM_FUNC_START(handle_tlb_modify)
-       csrwr   t0, EXCEPTION_KS0
-       csrwr   t1, EXCEPTION_KS1
-       csrwr   ra, EXCEPTION_KS2
+       csrwr           t0, EXCEPTION_KS0
+       csrwr           t1, EXCEPTION_KS1
+       csrwr           ra, EXCEPTION_KS2
 
        /*
         * The vmalloc handling is not in the hotpath.
         */
-       csrrd   t0, LOONGARCH_CSR_BADV
-       bltz    t0, vmalloc_modify
-       csrrd   t1, LOONGARCH_CSR_PGDL
+       csrrd           t0, LOONGARCH_CSR_BADV
+       bltz            t0, vmalloc_modify
+       csrrd           t1, LOONGARCH_CSR_PGDL
 
 vmalloc_done_modify:
        /* Get PGD offset in bytes */
-       srli.d  t0, t0, PGDIR_SHIFT
-       andi    t0, t0, (PTRS_PER_PGD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       bstrpick.d      ra, t0, PTRS_PER_PGD_BITS + PGDIR_SHIFT - 1, PGDIR_SHIFT
+       alsl.d          t1, ra, t1, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PUD_SHIFT
-       andi    t0, t0, (PTRS_PER_PUD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PUD_BITS + PUD_SHIFT - 1, PUD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-       csrrd   t0, LOONGARCH_CSR_BADV
-       ld.d    t1, t1, 0
-       srli.d  t0, t0, PMD_SHIFT
-       andi    t0, t0, (PTRS_PER_PMD - 1)
-       slli.d  t0, t0, 3
-       add.d   t1, t1, t0
+       ld.d            t1, t1, 0
+       bstrpick.d      ra, t0, PTRS_PER_PMD_BITS + PMD_SHIFT - 1, PMD_SHIFT
+       alsl.d          t1, ra, t1, 3
 #endif
-       ld.d    ra, t1, 0
+       ld.d            ra, t1, 0
 
        /*
         * For huge tlb entries, pmde doesn't contain an address but
         * instead contains the tlb pte. Check the PAGE_HUGE bit and
         * see if we need to jump to huge tlb processing.
         */
-       andi    t0, ra, _PAGE_HUGE
-       bnez    t0, tlb_huge_update_modify
+       rotri.d         ra, ra, _PAGE_HUGE_SHIFT + 1
+       bltz            ra, tlb_huge_update_modify
 
-       csrrd   t0, LOONGARCH_CSR_BADV
-       srli.d  t0, t0, PAGE_SHIFT
-       andi    t0, t0, (PTRS_PER_PTE - 1)
-       slli.d  t0, t0, _PTE_T_LOG2
-       add.d   t1, ra, t0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       bstrpick.d      t0, t0, PTRS_PER_PTE_BITS + PAGE_SHIFT - 1, PAGE_SHIFT
+       alsl.d          t1, t0, ra, _PTE_T_LOG2
 
 #ifdef CONFIG_SMP
 smp_pgtable_change_modify:
-#endif
-#ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
+       ll.d            t0, t1, 0
 #else
-       ld.d    t0, t1, 0
+       ld.d            t0, t1, 0
 #endif
-       tlbsrch
-
-       srli.d  ra, t0, _PAGE_WRITE_SHIFT
-       andi    ra, ra, 1
-       beqz    ra, nopage_tlb_modify
+       andi            ra, t0, _PAGE_WRITE
+       beqz            ra, nopage_tlb_modify
 
-       ori     t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       ori             t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, smp_pgtable_change_modify
+       sc.d            t0, t1, 0
+       beqz            t0, smp_pgtable_change_modify
 #else
-       st.d    t0, t1, 0
+       st.d            t0, t1, 0
 #endif
-       ori     t1, t1, 8
-       xori    t1, t1, 8
-       ld.d    t0, t1, 0
-       ld.d    t1, t1, 8
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       csrwr   t1, LOONGARCH_CSR_TLBELO1
+       tlbsrch
+       bstrins.d       t1, zero, 3, 3
+       ld.d            t0, t1, 0
+       ld.d            t1, t1, 8
+       csrwr           t0, LOONGARCH_CSR_TLBELO0
+       csrwr           t1, LOONGARCH_CSR_TLBELO1
        tlbwr
-leave_modify:
-       csrrd   t0, EXCEPTION_KS0
-       csrrd   t1, EXCEPTION_KS1
-       csrrd   ra, EXCEPTION_KS2
+
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
        ertn
+
 #ifdef CONFIG_64BIT
 vmalloc_modify:
-       la.abs  t1, swapper_pg_dir
-       b       vmalloc_done_modify
+       la.abs          t1, swapper_pg_dir
+       b               vmalloc_done_modify
 #endif
 
-       /*
-        * This is the entry point when
-        * build_tlbchange_handler_head spots a huge page.
-        */
+       /* This is the entry point of a huge page. */
 tlb_huge_update_modify:
 #ifdef CONFIG_SMP
-       ll.d    t0, t1, 0
-#else
-       ld.d    t0, t1, 0
+       ll.d            ra, t1, 0
 #endif
-
-       srli.d  ra, t0, _PAGE_WRITE_SHIFT
-       andi    ra, ra, 1
-       beqz    ra, nopage_tlb_modify
-
-       tlbsrch
-       ori     t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       andi            t0, ra, _PAGE_WRITE
+       beqz            t0, nopage_tlb_modify
 
 #ifdef CONFIG_SMP
-       sc.d    t0, t1, 0
-       beqz    t0, tlb_huge_update_modify
-       ld.d    t0, t1, 0
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       sc.d            t0, t1, 0
+       beqz            t0, tlb_huge_update_modify
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
 #else
-       st.d    t0, t1, 0
+       rotri.d         ra, ra, 64 - (_PAGE_HUGE_SHIFT + 1)
+       ori             t0, ra, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
+       st.d            t0, t1, 0
 #endif
        /*
         * A huge PTE describes an area the size of the
@@ -493,21 +446,20 @@ tlb_huge_update_modify:
         * address space.
         */
        /* Huge page: Move Global bit */
-       xori    t0, t0, _PAGE_HUGE
-       lu12i.w t1, _PAGE_HGLOBAL >> 12
-       and     t1, t0, t1
-       srli.d  t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
-       or      t0, t0, t1
+       xori            t0, t0, _PAGE_HUGE
+       lu12i.w         t1, _PAGE_HGLOBAL >> 12
+       and             t1, t0, t1
+       srli.d          t1, t1, (_PAGE_HGLOBAL_SHIFT - _PAGE_GLOBAL_SHIFT)
+       or              t0, t0, t1
 
-       addi.d  ra, t0, 0
-       csrwr   t0, LOONGARCH_CSR_TLBELO0
-       addi.d  t0, ra, 0
+       move            ra, t0
+       csrwr           ra, LOONGARCH_CSR_TLBELO0
 
        /* Convert to entrylo1 */
-       addi.d  t1, zero, 1
-       slli.d  t1, t1, (HPAGE_SHIFT - 1)
-       add.d   t0, t0, t1
-       csrwr   t0, LOONGARCH_CSR_TLBELO1
+       addi.d          t1, zero, 1
+       slli.d          t1, t1, (HPAGE_SHIFT - 1)
+       add.d           t0, t0, t1
+       csrwr           t0, LOONGARCH_CSR_TLBELO1
 
        /* Set huge page tlb entry size */
        addu16i.d       t0, zero, (CSR_TLBIDX_PS >> 16)
@@ -521,26 +473,31 @@ tlb_huge_update_modify:
        addu16i.d       t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
        csrxchg         t1, t0, LOONGARCH_CSR_TLBIDX
 
+       csrrd           t0, EXCEPTION_KS0
+       csrrd           t1, EXCEPTION_KS1
+       csrrd           ra, EXCEPTION_KS2
+       ertn
+
 nopage_tlb_modify:
-       dbar    0
-       csrrd   ra, EXCEPTION_KS2
-       la.abs  t0, tlb_do_page_fault_1
-       jr      t0
+       dbar            0
+       csrrd           ra, EXCEPTION_KS2
+       la.abs          t0, tlb_do_page_fault_1
+       jr              t0
 SYM_FUNC_END(handle_tlb_modify)
 
 SYM_FUNC_START(handle_tlb_refill)
-       csrwr   t0, LOONGARCH_CSR_TLBRSAVE
-       csrrd   t0, LOONGARCH_CSR_PGD
-       lddir   t0, t0, 3
+       csrwr           t0, LOONGARCH_CSR_TLBRSAVE
+       csrrd           t0, LOONGARCH_CSR_PGD
+       lddir           t0, t0, 3
 #if CONFIG_PGTABLE_LEVELS > 3
-       lddir   t0, t0, 2
+       lddir           t0, t0, 2
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-       lddir   t0, t0, 1
+       lddir           t0, t0, 1
 #endif
-       ldpte   t0, 0
-       ldpte   t0, 1
+       ldpte           t0, 0
+       ldpte           t0, 1
        tlbfill
-       csrrd   t0, LOONGARCH_CSR_TLBRSAVE
+       csrrd           t0, LOONGARCH_CSR_TLBRSAVE
        ertn
 SYM_FUNC_END(handle_tlb_refill)
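
A note on the rotri.d/bltz pair used in the fast paths above: rotating the PMD entry right by (_PAGE_HUGE_SHIFT + 1) moves the HUGE bit into the sign bit, so a single signed branch tests it, and rotating by the complementary amount (64 - (_PAGE_HUGE_SHIFT + 1)) restores the original value. A small self-contained C model (assuming _PAGE_HUGE_SHIFT is 6, as in the LoongArch page-table bits):

  #include <assert.h>
  #include <stdint.h>

  #define _PAGE_HUGE_SHIFT 6

  /* rotri.d rd, rj, n: rotate rj right by n bits (0 < n < 64) */
  static uint64_t rotri_d(uint64_t rj, unsigned int n)
  {
        return (rj >> n) | (rj << (64 - n));
  }

  int main(void)
  {
        uint64_t pmde = (1ULL << _PAGE_HUGE_SHIFT) | 0x1000;

        /* HUGE (bit 6) lands in bit 63: the "bltz" branch is taken. */
        uint64_t r = rotri_d(pmde, _PAGE_HUGE_SHIFT + 1);
        assert((int64_t)r < 0);

        /* The complementary rotate restores the original entry. */
        assert(rotri_d(r, 64 - (_PAGE_HUGE_SHIFT + 1)) == pmde);
        return 0;
  }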