; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; This test is based on load-store-vectors.ll
; and tests the lowering of 256-bit vector loads/stores.

; Types we are checking: i8, i16, half, bfloat, i32, i64, f32, f64

; Address spaces we are checking: generic, global, shared, local
; - Global is the only address space that currently supports 256-bit (v8.b32/v4.b64) loads/stores;
;   the other address spaces are legalized by splitting into smaller vectors

; 256-bit vector loads/stores are only legal for Blackwell+ (exercised here with sm_100), so on sm_90 the vectors will be split

; Types of loads/stores we are checking: normal, volatile
; - No need to check atomic loads/stores (monotonic and unordered) like load-store-vectors.ll checks,
;   because those only work with non-vectors

;; generic statespace

; generic

; Copies a <32 x i8> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v4.b32 loads
; and two v4.b32 stores (at offsets 0 and 16).
define void @generic_32xi8(ptr %a, ptr %b) {
; CHECK-LABEL: generic_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_32xi8_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_32xi8_param_1];
; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <32 x i8>, ptr %a
  store <32 x i8> %a.load, ptr %b
  ret void
}

; Copies a <16 x i16> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v4.b32 loads
; and two v4.b32 stores (at offsets 0 and 16).
define void @generic_16xi16(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xi16_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_16xi16_param_1];
; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i16>, ptr %a
  store <16 x i16> %a.load, ptr %b
  ret void
}

; Copies a <16 x half> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v4.b32 loads
; and two v4.b32 stores (at offsets 0 and 16).
define void @generic_16xhalf(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xhalf_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_16xhalf_param_1];
; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x half>, ptr %a
  store <16 x half> %a.load, ptr %b
  ret void
}

; Copies a <16 x bfloat> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v4.b32 loads
; and two v4.b32 stores (at offsets 0 and 16).
define void @generic_16xbfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xbfloat_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_16xbfloat_param_1];
; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x bfloat>, ptr %a
  store <16 x bfloat> %a.load, ptr %b
  ret void
}

; Copies a <8 x i32> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v4.b32 loads
; and two v4.b32 stores (at offsets 0 and 16).
define void @generic_8xi32(ptr %a, ptr %b) {
; CHECK-LABEL: generic_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi32_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_8xi32_param_1];
; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i32>, ptr %a
  store <8 x i32> %a.load, ptr %b
  ret void
}

; Copies a <4 x i64> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v2.b64 loads
; and two v2.b64 stores (at offsets 0 and 16).
define void @generic_4xi64(ptr %a, ptr %b) {
; CHECK-LABEL: generic_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi64_param_0];
; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_4xi64_param_1];
; CHECK-NEXT:    st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i64>, ptr %a
  store <4 x i64> %a.load, ptr %b
  ret void
}

; Copies a <8 x float> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v2.b64 loads
; and two v2.b64 stores; note the expected output moves the floats in 64-bit
; registers rather than as v4.b32.
define void @generic_8xfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xfloat_param_0];
; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_8xfloat_param_1];
; CHECK-NEXT:    st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <8 x float>, ptr %a
  store <8 x float> %a.load, ptr %b
  ret void
}

; Copies a <4 x double> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two v2.b64 loads
; and two v2.b64 stores (at offsets 0 and 16).
define void @generic_4xdouble(ptr %a, ptr %b) {
; CHECK-LABEL: generic_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xdouble_param_0];
; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_4xdouble_param_1];
; CHECK-NEXT:    st.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x double>, ptr %a
  store <4 x double> %a.load, ptr %b
  ret void
}

; generic_volatile

; Volatile copy of a <32 x i8> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v4.b32 loads and two st.volatile.v4.b32 stores.
define void @generic_volatile_32xi8(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_32xi8_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_32xi8_param_1];
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <32 x i8>, ptr %a
  store volatile <32 x i8> %a.load, ptr %b
  ret void
}

; Volatile copy of a <16 x i16> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v4.b32 loads and two st.volatile.v4.b32 stores.
define void @generic_volatile_16xi16(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xi16_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_16xi16_param_1];
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i16>, ptr %a
  store volatile <16 x i16> %a.load, ptr %b
  ret void
}

; Volatile copy of a <16 x half> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v4.b32 loads and two st.volatile.v4.b32 stores.
define void @generic_volatile_16xhalf(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xhalf_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_16xhalf_param_1];
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x half>, ptr %a
  store volatile <16 x half> %a.load, ptr %b
  ret void
}

; Volatile copy of a <16 x bfloat> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v4.b32 loads and two st.volatile.v4.b32 stores.
define void @generic_volatile_16xbfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xbfloat_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_16xbfloat_param_1];
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x bfloat>, ptr %a
  store volatile <16 x bfloat> %a.load, ptr %b
  ret void
}

; Volatile copy of a <8 x i32> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v4.b32 loads and two st.volatile.v4.b32 stores.
define void @generic_volatile_8xi32(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi32_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [generic_volatile_8xi32_param_1];
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i32>, ptr %a
  store volatile <8 x i32> %a.load, ptr %b
  ret void
}

; Volatile copy of a <4 x i64> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v2.b64 loads and two st.volatile.v2.b64 stores.
define void @generic_volatile_4xi64(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi64_param_0];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_volatile_4xi64_param_1];
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i64>, ptr %a
  store volatile <4 x i64> %a.load, ptr %b
  ret void
}

; Volatile copy of a <8 x float> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v2.b64 loads and two st.volatile.v2.b64 stores; note the expected
; output moves the floats in 64-bit registers rather than as v4.b32.
define void @generic_volatile_8xfloat(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1];
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x float>, ptr %a
  store volatile <8 x float> %a.load, ptr %b
  ret void
}

; Volatile copy of a <4 x double> (256 bits) between two generic pointers.
; Both tested targets are expected to split the access into two
; ld.volatile.v2.b64 loads and two st.volatile.v2.b64 stores.
define void @generic_volatile_4xdouble(ptr %a, ptr %b) {
; CHECK-LABEL: generic_volatile_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xdouble_param_0];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [generic_volatile_4xdouble_param_1];
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x double>, ptr %a
  store volatile <4 x double> %a.load, ptr %b
  ret void
}

;; global statespace

; global

; Copies a <32 x i8> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v4.b32 /
; st.global.v4.b32 pairs; on sm_100 a single ld.global.v8.b32 and
; st.global.v8.b32 is expected.
define void @global_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_32xi8(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_32xi8_param_0];
; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_32xi8_param_1];
; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_32xi8(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_32xi8_param_0];
; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_32xi8_param_1];
; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load <32 x i8>, ptr addrspace(1) %a
  store <32 x i8> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <16 x i16> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v4.b32 /
; st.global.v4.b32 pairs; on sm_100 a single ld.global.v8.b32 and
; st.global.v8.b32 is expected.
define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xi16(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_16xi16_param_0];
; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_16xi16_param_1];
; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_16xi16(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_16xi16_param_0];
; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_16xi16_param_1];
; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load <16 x i16>, ptr addrspace(1) %a
  store <16 x i16> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <16 x half> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v4.b32 /
; st.global.v4.b32 pairs; on sm_100 a single ld.global.v8.b32 and
; st.global.v8.b32 is expected.
define void @global_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xhalf(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_16xhalf_param_0];
; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_16xhalf_param_1];
; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_16xhalf(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_16xhalf_param_0];
; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_16xhalf_param_1];
; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load <16 x half>, ptr addrspace(1) %a
  store <16 x half> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <16 x bfloat> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v4.b32 /
; st.global.v4.b32 pairs; on sm_100 a single ld.global.v8.b32 and
; st.global.v8.b32 is expected.
define void @global_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_16xbfloat(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_16xbfloat_param_0];
; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_16xbfloat_param_1];
; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_16xbfloat(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_16xbfloat_param_0];
; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_16xbfloat_param_1];
; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load <16 x bfloat>, ptr addrspace(1) %a
  store <16 x bfloat> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <8 x i32> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v4.b32 /
; st.global.v4.b32 pairs; on sm_100 a single ld.global.v8.b32 and
; st.global.v8.b32 is expected.
define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_8xi32(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_8xi32_param_0];
; SM90-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_8xi32_param_1];
; SM90-NEXT:    st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_8xi32(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_8xi32_param_0];
; SM100-NEXT:    ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_8xi32_param_1];
; SM100-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load <8 x i32>, ptr addrspace(1) %a
  store <8 x i32> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <4 x i64> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v2.b64 /
; st.global.v2.b64 pairs; on sm_100 a single ld.global.v4.b64 and
; st.global.v4.b64 is expected.
define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_4xi64(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_4xi64_param_0];
; SM90-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_4xi64_param_1];
; SM90-NEXT:    st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_4xi64(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_4xi64_param_0];
; SM100-NEXT:    ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_4xi64_param_1];
; SM100-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load <4 x i64>, ptr addrspace(1) %a
  store <4 x i64> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <8 x float> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v2.b64 /
; st.global.v2.b64 pairs; on sm_100 a single ld.global.v4.b64 and
; st.global.v4.b64 is expected. Note the expected output moves the floats in
; 64-bit registers rather than as .b32 elements.
define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_8xfloat(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_8xfloat_param_0];
; SM90-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_8xfloat_param_1];
; SM90-NEXT:    st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_8xfloat(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_8xfloat_param_0];
; SM100-NEXT:    ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_8xfloat_param_1];
; SM100-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load <8 x float>, ptr addrspace(1) %a
  store <8 x float> %a.load, ptr addrspace(1) %b
  ret void
}

; Copies a <4 x double> (256 bits) between two global (addrspace(1)) pointers.
; On sm_90 the access is expected to be split into two ld.global.v2.b64 /
; st.global.v2.b64 pairs; on sm_100 a single ld.global.v4.b64 and
; st.global.v4.b64 is expected.
define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_4xdouble(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_4xdouble_param_0];
; SM90-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_4xdouble_param_1];
; SM90-NEXT:    st.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_4xdouble(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_4xdouble_param_0];
; SM100-NEXT:    ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_4xdouble_param_1];
; SM100-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load <4 x double>, ptr addrspace(1) %a
  store <4 x double> %a.load, ptr addrspace(1) %b
  ret void
}

; global_volatile

; Volatile copy of a <32 x i8> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pairs; on sm_100 a
; single volatile v8.b32 load and store is expected.
define void @global_volatile_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_32xi8(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_32xi8_param_0];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_32xi8_param_1];
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_32xi8(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_32xi8_param_0];
; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_32xi8_param_1];
; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load volatile <32 x i8>, ptr addrspace(1) %a
  store volatile <32 x i8> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <16 x i16> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pairs; on sm_100 a
; single volatile v8.b32 load and store is expected.
define void @global_volatile_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xi16(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_16xi16_param_0];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_16xi16_param_1];
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_16xi16(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_16xi16_param_0];
; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_16xi16_param_1];
; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load volatile <16 x i16>, ptr addrspace(1) %a
  store volatile <16 x i16> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <16 x half> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pairs; on sm_100 a
; single volatile v8.b32 load and store is expected.
define void @global_volatile_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xhalf(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_16xhalf_param_0];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_16xhalf_param_1];
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_16xhalf(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_16xhalf_param_0];
; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_16xhalf_param_1];
; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load volatile <16 x half>, ptr addrspace(1) %a
  store volatile <16 x half> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <16 x bfloat> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pairs; on sm_100 a
; single volatile v8.b32 load and store is expected.
define void @global_volatile_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_16xbfloat(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1];
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_16xbfloat(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0];
; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1];
; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load volatile <16 x bfloat>, ptr addrspace(1) %a
  store volatile <16 x bfloat> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <8 x i32> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pairs; on sm_100 a
; single volatile v8.b32 load and store is expected.
define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_8xi32(
; SM90:       {
; SM90-NEXT:    .reg .b32 %r<9>;
; SM90-NEXT:    .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi32_param_0];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd2, [global_volatile_8xi32_param_1];
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; SM90-NEXT:    st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_8xi32(
; SM100:       {
; SM100-NEXT:    .reg .b32 %r<9>;
; SM100-NEXT:    .reg .b64 %rd<3>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi32_param_0];
; SM100-NEXT:    ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd2, [global_volatile_8xi32_param_1];
; SM100-NEXT:    st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; SM100-NEXT:    ret;
  %a.load = load volatile <8 x i32>, ptr addrspace(1) %a
  store volatile <8 x i32> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <4 x i64> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v2.b64 / st.volatile.global.v2.b64 pairs; on sm_100 a
; single volatile v4.b64 load and store is expected.
define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_4xi64(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi64_param_0];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_volatile_4xi64_param_1];
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_4xi64(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi64_param_0];
; SM100-NEXT:    ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_volatile_4xi64_param_1];
; SM100-NEXT:    st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load volatile <4 x i64>, ptr addrspace(1) %a
  store volatile <4 x i64> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <8 x float> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v2.b64 / st.volatile.global.v2.b64 pairs; on sm_100 a
; single volatile v4.b64 load and store is expected. Note the expected output
; moves the floats in 64-bit registers rather than as .b32 elements.
define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_8xfloat(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_volatile_8xfloat_param_1];
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_8xfloat(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_8xfloat_param_0];
; SM100-NEXT:    ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_volatile_8xfloat_param_1];
; SM100-NEXT:    st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load volatile <8 x float>, ptr addrspace(1) %a
  store volatile <8 x float> %a.load, ptr addrspace(1) %b
  ret void
}

; Volatile copy of a <4 x double> (256 bits) between two global (addrspace(1))
; pointers. On sm_90 the access is expected to be split into two
; ld.volatile.global.v2.b64 / st.volatile.global.v2.b64 pairs; on sm_100 a
; single volatile v4.b64 load and store is expected.
define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) {
; SM90-LABEL: global_volatile_4xdouble(
; SM90:       {
; SM90-NEXT:    .reg .b64 %rd<7>;
; SM90-EMPTY:
; SM90-NEXT:  // %bb.0:
; SM90-NEXT:    ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; SM90-NEXT:    ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16];
; SM90-NEXT:    ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5};
; SM90-NEXT:    st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3};
; SM90-NEXT:    ret;
;
; SM100-LABEL: global_volatile_4xdouble(
; SM100:       {
; SM100-NEXT:    .reg .b64 %rd<7>;
; SM100-EMPTY:
; SM100-NEXT:  // %bb.0:
; SM100-NEXT:    ld.param.b64 %rd1, [global_volatile_4xdouble_param_0];
; SM100-NEXT:    ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT:    ld.param.b64 %rd6, [global_volatile_4xdouble_param_1];
; SM100-NEXT:    st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; SM100-NEXT:    ret;
  %a.load = load volatile <4 x double>, ptr addrspace(1) %a
  store volatile <4 x double> %a.load, ptr addrspace(1) %b
  ret void
}

;; shared statespace

; shared

; Copies a <32 x i8> (256 bits) between two shared (addrspace(3)) pointers.
; Both tested targets are expected to split the access into two
; ld.shared.v4.b32 loads and two st.shared.v4.b32 stores.
define void @shared_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_32xi8_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_32xi8_param_1];
; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <32 x i8>, ptr addrspace(3) %a
  store <32 x i8> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <16 x i16> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.shared.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @shared_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xi16_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_16xi16_param_1];
; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i16>, ptr addrspace(3) %a
  store <16 x i16> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <16 x half> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.shared.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @shared_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xhalf_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_16xhalf_param_1];
; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x half>, ptr addrspace(3) %a
  store <16 x half> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <16 x bfloat> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.shared.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xbfloat_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_16xbfloat_param_1];
; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x bfloat>, ptr addrspace(3) %a
  store <16 x bfloat> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <8 x i32> (256-bit) vector between two shared-memory (addrspace(3))
; pointers. Both llc configurations expect identical PTX: two 128-bit
; ld.shared.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi32_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_8xi32_param_1];
; CHECK-NEXT:    st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i32>, ptr addrspace(3) %a
  store <8 x i32> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <4 x i64> (256-bit) vector between two shared-memory (addrspace(3))
; pointers. Both llc configurations expect identical PTX: two 128-bit
; ld.shared.v2.b64 loads (offsets 0 and 16) and two matching stores.
define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi64_param_0];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_4xi64_param_1];
; CHECK-NEXT:    st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i64>, ptr addrspace(3) %a
  store <4 x i64> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <8 x float> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.shared.v2.b64 loads (offsets 0 and 16) and two matching stores.
; Note that the data travels in four 64-bit registers, not eight 32-bit ones.
define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xfloat_param_0];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_8xfloat_param_1];
; CHECK-NEXT:    st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <8 x float>, ptr addrspace(3) %a
  store <8 x float> %a.load, ptr addrspace(3) %b
  ret void
}

; Copies a <4 x double> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.shared.v2.b64 loads (offsets 0 and 16) and two matching stores.
define void @shared_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xdouble_param_0];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_4xdouble_param_1];
; CHECK-NEXT:    st.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x double>, ptr addrspace(3) %a
  store <4 x double> %a.load, ptr addrspace(3) %b
  ret void
}

; shared_volatile

; Volatile copy of a <32 x i8> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: the
; access is split into two 128-bit v4.b32 halves, and each half keeps the
; .volatile qualifier (ld.volatile.shared / st.volatile.shared).
define void @shared_volatile_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_32xi8_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_32xi8_param_1];
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <32 x i8>, ptr addrspace(3) %a
  store volatile <32 x i8> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <16 x i16> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v4.b32 halves, each keeping the .volatile qualifier.
define void @shared_volatile_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xi16_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_16xi16_param_1];
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i16>, ptr addrspace(3) %a
  store volatile <16 x i16> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <16 x half> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v4.b32 halves, each keeping the .volatile qualifier.
define void @shared_volatile_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xhalf_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_16xhalf_param_1];
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x half>, ptr addrspace(3) %a
  store volatile <16 x half> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <16 x bfloat> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v4.b32 halves, each keeping the .volatile qualifier.
define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xbfloat_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_16xbfloat_param_1];
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x bfloat>, ptr addrspace(3) %a
  store volatile <16 x bfloat> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <8 x i32> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v4.b32 halves, each keeping the .volatile qualifier.
define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi32_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [shared_volatile_8xi32_param_1];
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i32>, ptr addrspace(3) %a
  store volatile <8 x i32> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <4 x i64> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v2.b64 halves, each keeping the .volatile qualifier.
define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi64_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_volatile_4xi64_param_1];
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i64>, ptr addrspace(3) %a
  store volatile <4 x i64> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <8 x float> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v2.b64 halves, each keeping the .volatile qualifier. The data
; travels in four 64-bit registers, not eight 32-bit ones.
define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1];
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x float>, ptr addrspace(3) %a
  store volatile <8 x float> %a.load, ptr addrspace(3) %b
  ret void
}

; Volatile copy of a <4 x double> (256-bit) vector between two shared-memory
; (addrspace(3)) pointers. Both llc configurations expect identical PTX: two
; 128-bit v2.b64 halves, each keeping the .volatile qualifier.
define void @shared_volatile_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) {
; CHECK-LABEL: shared_volatile_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xdouble_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [shared_volatile_4xdouble_param_1];
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x double>, ptr addrspace(3) %a
  store volatile <4 x double> %a.load, ptr addrspace(3) %b
  ret void
}

;; local statespace

; local

; Copies a <32 x i8> (256-bit) vector between two local-memory (addrspace(5))
; pointers. Both llc configurations expect identical PTX: the access is split
; into two 128-bit ld.local.v4.b32 loads (offsets 0 and 16) and two matching
; st.local.v4.b32 stores.
define void @local_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_32xi8_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_32xi8_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <32 x i8>, ptr addrspace(5) %a
  store <32 x i8> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <16 x i16> (256-bit) vector between two local-memory (addrspace(5))
; pointers. Both llc configurations expect identical PTX: two 128-bit
; ld.local.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @local_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_16xi16_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i16>, ptr addrspace(5) %a
  store <16 x i16> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <16 x half> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.local.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @local_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xhalf_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_16xhalf_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x half>, ptr addrspace(5) %a
  store <16 x half> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <16 x bfloat> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.local.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xbfloat_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_16xbfloat_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <16 x bfloat>, ptr addrspace(5) %a
  store <16 x bfloat> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <8 x i32> (256-bit) vector between two local-memory (addrspace(5))
; pointers. Both llc configurations expect identical PTX: two 128-bit
; ld.local.v4.b32 loads (offsets 0 and 16) and two matching stores.
define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi32_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_8xi32_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i32>, ptr addrspace(5) %a
  store <8 x i32> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <4 x i64> (256-bit) vector between two local-memory (addrspace(5))
; pointers. Both llc configurations expect identical PTX: two 128-bit
; ld.local.v2.b64 loads (offsets 0 and 16) and two matching stores.
define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi64_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_4xi64_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i64>, ptr addrspace(5) %a
  store <4 x i64> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <8 x float> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.local.v2.b64 loads (offsets 0 and 16) and two matching stores.
; Note that the data travels in four 64-bit registers, not eight 32-bit ones.
define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xfloat_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_8xfloat_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <8 x float>, ptr addrspace(5) %a
  store <8 x float> %a.load, ptr addrspace(5) %b
  ret void
}

; Copies a <4 x double> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect identical PTX: two
; 128-bit ld.local.v2.b64 loads (offsets 0 and 16) and two matching stores.
define void @local_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xdouble_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_4xdouble_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load <4 x double>, ptr addrspace(5) %a
  store <4 x double> %a.load, ptr addrspace(5) %b
  ret void
}

; local_volatile

; Volatile copy of a <32 x i8> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v4.b32
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_32xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_32xi8_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_32xi8_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <32 x i8>, ptr addrspace(5) %a
  store volatile <32 x i8> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <16 x i16> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v4.b32
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_16xi16_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i16>, ptr addrspace(5) %a
  store volatile <16 x i16> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <16 x half> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v4.b32
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xhalf(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xhalf_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_16xhalf_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x half>, ptr addrspace(5) %a
  store volatile <16 x half> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <16 x bfloat> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v4.b32
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_16xbfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xbfloat_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_16xbfloat_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x bfloat>, ptr addrspace(5) %a
  store volatile <16 x bfloat> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <8 x i32> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v4.b32
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_8xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd2, [local_volatile_8xi32_param_1];
; CHECK-NEXT:    st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT:    st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i32>, ptr addrspace(5) %a
  store volatile <8 x i32> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <4 x i64> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v2.b64
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_4xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi64_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_volatile_4xi64_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i64>, ptr addrspace(5) %a
  store volatile <4 x i64> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <8 x float> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v2.b64
; halves, carried in four 64-bit registers. Although the IR accesses are
; volatile, the expected PTX uses plain ld.local / st.local with no .volatile
; qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_8xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x float>, ptr addrspace(5) %a
  store volatile <8 x float> %a.load, ptr addrspace(5) %b
  ret void
}

; Volatile copy of a <4 x double> (256-bit) vector between two local-memory
; (addrspace(5)) pointers. Both llc configurations expect two 128-bit v2.b64
; halves. Although the IR accesses are volatile, the expected PTX uses plain
; ld.local / st.local with no .volatile qualifier.
; NOTE(review): presumably the qualifier is dropped because local memory is
; private to a thread - confirm against the NVPTX backend.
define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-LABEL: local_volatile_4xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xdouble_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT:    ld.param.b64 %rd6, [local_volatile_4xdouble_param_1];
; CHECK-NEXT:    st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
; CHECK-NEXT:    st.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x double>, ptr addrspace(5) %a
  store volatile <4 x double> %a.load, ptr addrspace(5) %b
  ret void
}
