
      High-Performance Computing: Tensor Core WMMA HGEMM

      1. Tensor Core overview:

      • The hardware natively supports half-precision floating-point matrix multiplication, similar to the cube core on Ascend NPUs; it can only operate on matrices of fixed minimum tile sizes.
      • The wmma API is wrapped in the nvcuda namespace (a minimal usage sketch follows).
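
      The wmma types and intrinsics come from <mma.h>. Below is a minimal sketch of the pieces used throughout this post, for the fixed m16n16k16 half-precision shape; wmma_tile_demo and its parameters are purely illustrative:

      #include <mma.h>
      using namespace nvcuda;

      //a minimal warp-level tile multiply: one 16x16 tile of C = A_tile * B_tile
      //each fragment is owned cooperatively by one warp; the shape is fixed at compile time
      __device__ void wmma_tile_demo(const half *Atile, const half *Btile, half *Ctile, int lda, int ldb, int ldc)
      {
          wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a;
          wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b;
          wmma::fragment<wmma::accumulator, 16, 16, 16, half> c;
          wmma::fill_fragment(c, 0.0f);
          wmma::load_matrix_sync(a, Atile, lda);
          wmma::load_matrix_sync(b, Btile, ldb);
          wmma::mma_sync(c, a, b, c); //one tensor-core matrix-multiply-accumulate per warp
          wmma::store_matrix_sync(Ctile, c, ldc, wmma::mem_row_major);
      }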

      2. Naive version:

      • One warp per block; each warp computes one 16×16 WMMA tile.
      //naive: one warp per block; each warp computes one 16x16 tile of C
      template <const uint32_t WMMA_M=16,const uint32_t WMMA_N=16,const uint32_t WMMA_K=16>
      __global__ void hgemm_wmma_m16n16k16_naive_kernel(half *A,half *B,half *C, int M,int N,int K)
      {
          const uint32_t aStartRow = blockIdx.y * WMMA_M;
          const uint32_t bStartCol = blockIdx.x * WMMA_N;
          if(aStartRow >= M || bStartCol >= N)
              return;
      
          //declare the accumulator fragment
          wmma::fragment<wmma::accumulator,WMMA_M,WMMA_N,WMMA_K,half> cFrag;
          wmma::fill_fragment(cFrag,0.0);
      
          //declare the A and B input fragments
          wmma::fragment<wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> aFrag;
          wmma::fragment<wmma::matrix_b,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> bFrag;
      
          //loop over the K dimension
          #pragma unroll
          for(int k=0;k<div_ceil(K,WMMA_K);k++)
          {
              //load the tiles; the leading dimensions of A and B are K and N respectively
              wmma::load_matrix_sync(aFrag, A + aStartRow*K + k*WMMA_K,K);
              wmma::load_matrix_sync(bFrag, B + k*WMMA_K*N + bStartCol,N);
      
              //multiply-accumulate; cFrag may appear as both input and output
              wmma::mma_sync(cFrag,aFrag,bFrag,cFrag);
          }
          //write the result back; the leading dimension of C is N
          wmma::store_matrix_sync(C + aStartRow*N + bStartCol,cFrag,N,wmma::mem_row_major);
      
      }
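
      div_ceil comes from common/common.h (not shown). A plausible definition, assuming it is the usual ceiling-division helper:

      //assumed helper: how many size-b steps are needed to cover a
      __host__ __device__ constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }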
      
      

      3. Optimization 1:

      • One block holds 4 × 2 warps (4 along y, 2 along x), each computing its own 16×16 tile from a shared-memory staging buffer.
      //optimization 1: shared memory + a (2,4) warp grid per block: more warps per block, step WMMA_K along K
      template<const uint32_t WMMA_M=16,const uint32_t WMMA_N=16,const uint32_t WMMA_K=16,const uint32_t BXNum=2,const uint32_t BYNum=4>
      __global__ void hgemm_wmma_m16n16k16_block2x4_kernel(half *A,half *B,half *C, int M,int N,int K)
      {
          //linear thread id within the block
          uint32_t bid = threadIdx.y * blockDim.x + threadIdx.x;
      
          //which warp of the block this thread belongs to
          //warp grid (2,4): 2 warps along x, 4 along y
          /*
          warp0 | warp1
          warp2 | warp3
          warp4 | warp5
          warp6 | warp7
          */
          uint32_t warpY = bid/(32*BXNum);
          uint32_t warpX = (bid/32)%BXNum;
      
          // shared memory holding one K-step worth of A and B
          __shared__ half sharedA[WMMA_M*BYNum][WMMA_K];
          __shared__ half sharedB[WMMA_K][WMMA_N*BXNum];
      
          //number of elements each thread fetches
          //A 
          uint32_t nFetchANum = WMMA_M*BYNum*WMMA_K/(blockDim.y*blockDim.x); //4 halves: one 8-byte load (float2)
          //B
          uint32_t nFetchBNum = WMMA_K*WMMA_N*BXNum/(blockDim.y*blockDim.x); //2 halves: one half2
      
          //thread layout for staging one K-step into shared memory
          //A: one row takes 4 threads
          uint32_t threadsPerRowA = WMMA_K/nFetchANum;
          //this thread's coordinates in the A-staging layout
          uint32_t threadRowA = bid/threadsPerRowA;
          uint32_t threadColA = bid%threadsPerRowA;
          
          //B: one row takes 16 threads
          uint32_t threadsPerRowB = WMMA_N*BXNum/nFetchBNum;
          //this thread's coordinates in the B-staging layout
          uint32_t threadRowB = bid/threadsPerRowB;
          uint32_t threadColB = bid%threadsPerRowB;
          
          //declare the wmma fragments
          wmma::fragment<wmma::accumulator,WMMA_M,WMMA_N,WMMA_K,half> CFrag;
          wmma::fill_fragment(CFrag,0.0);
      
          wmma::fragment<wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> AFrag;
          wmma::fragment<wmma::matrix_b,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> BFrag;
      
          // #pragma unroll
          for(int k=0;k<K;k+=WMMA_K)
          {
              //stage data into shared memory
              //pitfall: this index math assumes 256 threads launched as dim3(8,32)
              CAST_HALF4(&sharedA[threadRowA][threadColA*nFetchANum])[0] = CAST_HALF4(A + (blockIdx.y*WMMA_M*BYNum+threadRowA)*K + (k+threadColA*nFetchANum))[0];
              CAST_HALF2(&sharedB[threadRowB][threadColB*nFetchBNum])[0] = CAST_HALF2(B + (k+threadRowB)*N + blockIdx.x*WMMA_N*BXNum + threadColB*nFetchBNum)[0];
              __syncthreads();
      
              //wmma compute; ldm is the leading dimension of the shared-memory tile
              wmma::load_matrix_sync(AFrag,&sharedA[warpY*WMMA_M][0],WMMA_K);
              wmma::load_matrix_sync(BFrag,&sharedB[0][warpX*WMMA_N],WMMA_N*BXNum);
      
              wmma::mma_sync(CFrag,AFrag,BFrag,CFrag);
      
              __syncthreads();
          }
      
          //store the result
          uint32_t rowStore = (blockIdx.y *BYNum + warpY) * WMMA_M;
          uint32_t colStore = (blockIdx.x *BXNum + warpX) * WMMA_N;
          //ldm: leading dimension of the destination C in global memory
          wmma::store_matrix_sync(C + rowStore*N + colStore,CFrag,N,wmma::mem_row_major);
      }
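
      To sanity-check the index math, here is the arithmetic for the launch shape used later (blockDim = (8,32), 256 threads):

      //sharedA is 64 x 16 halves = 1024 elements -> 1024/256 = 4 halves per thread (one float2)
      //  threadsPerRowA = 16/4 = 4  -> threadRowA in [0,64), threadColA in [0,4)
      //sharedB is 16 x 32 halves = 512 elements  -> 512/256 = 2 halves per thread (one half2)
      //  threadsPerRowB = 32/2 = 16 -> threadRowB in [0,16), threadColB in [0,16)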
      
      

      4. Optimization 2:

      • Each warp now computes 2 × 4 = 8 of the 16×16 tiles, so one shared-memory staging is reused across several output positions.
      //optimization 2: shared memory + a (2,4) warp grid per block; each warp computes 2*4 = 8 tiles, step WMMA_K along K
      //warp grid in the block is (2,4); each warp's 8 tiles are laid out as (4,2) (x=4, y=2); one block computes a 128x128 tile of C
      //one shared-memory staging feeds results at several output positions
      template<const uint32_t WMMA_M=16,const uint32_t WMMA_N=16,const uint32_t WMMA_K=16,
      const uint32_t BXNum=2,const uint32_t BYNum=4,const uint32_t WarpXNum=4,const uint32_t WarpYNum =2>
      __global__ void hgemm_wmma_m16n16k16_block2x4_wmma4x2_kernel(half *A,half *B,half *C, int M,int N,int K)
      {
          //linear thread id within the block
          uint32_t bid = threadIdx.y * blockDim.x + threadIdx.x;
          
          //which warp of the block this thread belongs to
          //warp grid (2,4): 2 warps along x, 4 along y
          /*
          warp0 | warp1
          warp2 | warp3
          warp4 | warp5
          warp6 | warp7
          */
          uint32_t warpY = bid/(32*BXNum);
          uint32_t warpX = (bid/32)%BXNum;
      
          //size of the output tile computed by one block
          const uint32_t BM = BYNum*WarpYNum*WMMA_M;
          const uint32_t BN = BXNum*WarpXNum*WMMA_N;
          const uint32_t BK = WMMA_K;
          // shared memory holding one K-step worth of A and B
          __shared__ half sharedA[BM][BK];
          __shared__ half sharedB[BK][BN];
      
          //number of elements each thread fetches
          //A 
          uint32_t nFetchANum = BM*BK/(blockDim.y*blockDim.x); //8 halves: one float4
          //B
          uint32_t nFetchBNum = BK*BN/(blockDim.y*blockDim.x); //8 halves: one float4
      
          //thread layout for staging one K-step into shared memory
          //A: one row takes 2 threads
          uint32_t threadsPerRowA = BK/nFetchANum;
          //this thread's coordinates in the A-staging layout
          uint32_t threadRowA = bid/threadsPerRowA;
          uint32_t threadColA = bid%threadsPerRowA;
          
          //B: one row takes 16 threads
          uint32_t threadsPerRowB = BN/nFetchBNum;
          //this thread's coordinates in the B-staging layout
          uint32_t threadRowB = bid/threadsPerRowB;
          uint32_t threadColB = bid%threadsPerRowB;
          
          //declare the wmma fragments
          wmma::fragment<wmma::accumulator,WMMA_M,WMMA_N,WMMA_K,half> CFrag[WarpYNum][WarpXNum];
          for(int i=0;i<WarpYNum;i++)
          {
              for(int j=0;j<WarpXNum;j++)
                  wmma::fill_fragment(CFrag[i][j],0.0);
          }
      
          wmma::fragment<wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> AFrag[WarpYNum];
          wmma::fragment<wmma::matrix_b,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> BFrag[WarpXNum];
      
          // #pragma unroll
          for(int k=0;k<K;k+=WMMA_K)
          {
              //stage data into shared memory
              CAST_FLOAT4(&sharedA[threadRowA][threadColA*nFetchANum])[0] = CAST_FLOAT4(A + (blockIdx.y*BM+threadRowA)*K + (k+threadColA*nFetchANum))[0];
              CAST_FLOAT4(&sharedB[threadRowB][threadColB*nFetchBNum])[0] = CAST_FLOAT4(B + (k+threadRowB)*N + blockIdx.x*BN + threadColB*nFetchBNum)[0];
              __syncthreads();
              //sharedA holds BYNum*WarpYNum = 8 row-tiles along Y
              //walk this warp's output tiles, loading the needed slices from shared memory
              for(int i=0;i<WarpYNum;i++)
              {   
                  wmma::load_matrix_sync(AFrag[i],&sharedA[(warpY*WarpYNum +i) * WMMA_M][0],BK);
                  for(int j=0;j<WarpXNum;j++)
                  {
                      wmma::load_matrix_sync(BFrag[j],&sharedB[0][(warpX*WarpXNum + j)*WMMA_N],BN);
                      wmma::mma_sync(CFrag[i][j],AFrag[i],BFrag[j],CFrag[i][j]);
                  }
              } 
              __syncthreads();
          }
      
          //store the results
          for(int i=0;i<WarpYNum;i++)
          {
              for(int j=0;j<WarpXNum;j++)
              {
                  uint32_t rowStore = (blockIdx.y * BYNum * WarpYNum + warpY*WarpYNum + i) * WMMA_M;
                  uint32_t colStore = (blockIdx.x * BXNum * WarpXNum + warpX*WarpXNum + j) * WMMA_N;
                  //ldm: leading dimension of the destination C in global memory
                  wmma::store_matrix_sync(C + rowStore*N + colStore,CFrag[i][j],N,wmma::mem_row_major);
              }
          }
      }
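
      For the default parameters the tiling works out as follows (again assuming blockDim = (8,32)):

      //BM = BYNum*WarpYNum*WMMA_M = 4*2*16 = 128, BN = BXNum*WarpXNum*WMMA_N = 2*4*16 = 128, BK = 16
      //sharedA + sharedB = (128*16 + 16*128) halves = 8 KB of shared memory per block
      //each thread stages 128*16/256 = 8 halves = 16 bytes (one float4) of A and of B per K-step
      //each warp owns a 2x4 grid of 16x16 tiles, i.e. a 32x64 sub-block of the 128x128 block tile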
      

      5. Optimization 3:

      • Double buffering plus inline PTX cp.async instructions copy the next K-step to shared memory asynchronously while the current one is computed; the OFFSET template parameter pads each shared-memory row (8 halves = 16 bytes in the launch code) to reduce bank conflicts without breaking 16-byte alignment. The CP_ASYNC_* macros are sketched right below.
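
      The CP_ASYNC_* macros wrap the Ampere (sm_80+) cp.async PTX instructions. They are presumably defined in common/common.h; a typical definition (an assumption, not the actual file contents):

      //16-byte global->shared copy that bypasses L1 (.cg); dst must be a shared-state-space address
      #define CP_ASYNC_CG(dst, src, bytes) \
          asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" ::"r"(dst), "l"(src), "n"(bytes))
      //commit all cp.async ops issued so far as one group
      #define CP_ASYNC_COMMIT_GROUP() asm volatile("cp.async.commit_group;\n" ::)
      //block until at most n committed groups are still in flight
      #define CP_ASYNC_WAIT_GROUP(n) asm volatile("cp.async.wait_group %0;\n" ::"n"(n))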
      //optimization 3: shared memory + a (2,4) warp grid per block; each warp computes 2*4 = 8 tiles, step WMMA_K along K
      //warp grid in the block is (2,4); each warp's 8 tiles are laid out as (4,2) (x=4, y=2); one block computes a 128x128 tile of C
      //one shared-memory staging feeds results at several output positions
      //double buffering + inline PTX: warps copy the next step to shared memory asynchronously while computing the current one
      template<const uint32_t WMMA_M=16,const uint32_t WMMA_N=16,const uint32_t WMMA_K=16,
      const uint32_t BXNum=2,const uint32_t BYNum=4,const uint32_t WarpXNum=4,const uint32_t WarpYNum =2,const uint32_t OFFSET=0>
      __global__ void hgemm_wmma_m16n16k16_block2x4_wmma4x2_dBuff_async_kernel(half *A,half *B,half *C, int M,int N,int K)
      {
          //linear thread id within the block
          uint32_t bid = threadIdx.y * blockDim.x + threadIdx.x;
          
          //which warp of the block this thread belongs to
          //warp grid (2,4): 2 warps along x, 4 along y
          /*
          warp0 | warp1
          warp2 | warp3
          warp4 | warp5
          warp6 | warp7
          */
          uint32_t warpY = bid/(32*BXNum);
          uint32_t warpX = (bid/32)%BXNum;
      
          //size of the output tile computed by one block
          const uint32_t BM = BYNum*WarpYNum*WMMA_M;
          const uint32_t BN = BXNum*WarpXNum*WMMA_N;
          const uint32_t BK = WMMA_K;
          // double-buffered shared memory holding one K-step worth of A and B
          __shared__ half sharedA[2][BM][BK+OFFSET];
          __shared__ half sharedB[2][BK][BN+OFFSET];
      
          //number of elements each thread fetches
          //A 
          uint32_t nFetchANum = BM*BK/(blockDim.y*blockDim.x); //8 halves: one float4
          //B
          uint32_t nFetchBNum = BK*BN/(blockDim.y*blockDim.x); //8 halves: one float4
      
          //thread layout for staging one K-step into shared memory
          //A: one row takes 2 threads
          uint32_t threadsPerRowA = BK/nFetchANum;
          //this thread's coordinates in the A-staging layout
          uint32_t threadRowA = bid/threadsPerRowA;
          uint32_t threadColA = bid%threadsPerRowA;
          
          //B: one row takes 16 threads
          uint32_t threadsPerRowB = BN/nFetchBNum;
          //this thread's coordinates in the B-staging layout
          uint32_t threadRowB = bid/threadsPerRowB;
          uint32_t threadColB = bid%threadsPerRowB;
          
          //prefetch the first K-step into shared memory
          uint32_t writeFlag = 0;
          uint32_t readFlag = 1- writeFlag;
          //convert the shared-memory destinations to shared-state-space addresses
          uint32_t cp_Offset_A = __cvta_generic_to_shared(&sharedA[writeFlag][threadRowA][threadColA*nFetchANum]);
          uint32_t cp_Offset_B = __cvta_generic_to_shared(&sharedB[writeFlag][threadRowB][threadColB*nFetchBNum]);
          //dst src bytes
          CP_ASYNC_CG(cp_Offset_A, A + (blockIdx.y*BM+threadRowA)*K + threadColA*nFetchANum,16);
          CP_ASYNC_CG(cp_Offset_B, B + threadRowB*N + blockIdx.x*BN + threadColB*nFetchBNum,16);
          //commit the async copies as one group
          CP_ASYNC_COMMIT_GROUP();
          //block until the prefetch has landed
          CP_ASYNC_WAIT_GROUP(0);
      
          __syncthreads();
      
          //declare the wmma fragments
          wmma::fragment<wmma::accumulator,WMMA_M,WMMA_N,WMMA_K,half> CFrag[WarpYNum][WarpXNum];
          for(int i=0;i<WarpYNum;i++)
          {
              for(int j=0;j<WarpXNum;j++)
                  wmma::fill_fragment(CFrag[i][j],0.0);
          }
          wmma::fragment<wmma::matrix_a,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> AFrag[WarpYNum];
          wmma::fragment<wmma::matrix_b,WMMA_M,WMMA_N,WMMA_K,half,wmma::row_major> BFrag[WarpXNum];
      
          // #pragma unroll
          for(int k=WMMA_K;k<K;k+=WMMA_K)
          {
              // flip the double-buffer read/write flags
              writeFlag = 1 - writeFlag;
              readFlag = 1 - writeFlag;
      
              //issue the async copies of the next K-step into the write buffer
              cp_Offset_A = __cvta_generic_to_shared(&sharedA[writeFlag][threadRowA][threadColA*nFetchANum]);
              cp_Offset_B = __cvta_generic_to_shared(&sharedB[writeFlag][threadRowB][threadColB*nFetchBNum]);
              //dst src bytes
              CP_ASYNC_CG(cp_Offset_A, A + (blockIdx.y*BM+threadRowA)*K + k+threadColA*nFetchANum,16);
              CP_ASYNC_CG(cp_Offset_B, B + (k+threadRowB)*N + blockIdx.x*BN + threadColB*nFetchBNum,16);
              //commit the async copies as one group
              CP_ASYNC_COMMIT_GROUP();
      
              //sharedA holds BYNum*WarpYNum = 8 row-tiles along Y
              //walk this warp's output tiles, loading the needed slices from the read buffer
              for(int i=0;i<WarpYNum;i++)
              {   
                  wmma::load_matrix_sync(AFrag[i],&sharedA[readFlag][(warpY*WarpYNum +i) * WMMA_M][0],BK+OFFSET);
                  for(int j=0;j<WarpXNum;j++)
                  {
                      wmma::load_matrix_sync(BFrag[j],&sharedB[readFlag][0][(warpX*WarpXNum + j)*WMMA_N],BN+OFFSET);
                      wmma::mma_sync(CFrag[i][j],AFrag[i],BFrag[j],CFrag[i][j]);
                  }
              }
              
              // wait for the committed copy group to complete
              CP_ASYNC_WAIT_GROUP(0);
              // block-wide barrier before the next iteration reads the fresh buffer
              __syncthreads();
          }
      
          // flip the flags once more so readFlag points at the last-written buffer
          writeFlag = 1 - writeFlag;
          readFlag = 1 - writeFlag;
      
          //compute the final K-step
          for(int i=0;i<WarpYNum;i++)
          {   
              wmma::load_matrix_sync(AFrag[i],&sharedA[readFlag][(warpY*WarpYNum +i) * WMMA_M][0],BK+OFFSET);
              for(int j=0;j<WarpXNum;j++)
              {
                  wmma::load_matrix_sync(BFrag[j],&sharedB[readFlag][0][(warpX*WarpXNum + j)*WMMA_N],BN+OFFSET);
                  wmma::mma_sync(CFrag[i][j],AFrag[i],BFrag[j],CFrag[i][j]);
              }
          }
      
          //store the results
          for(int i=0;i<WarpYNum;i++)
          {
              for(int j=0;j<WarpXNum;j++)
              {
                  uint32_t rowStore = (blockIdx.y * BYNum * WarpYNum + warpY*WarpYNum + i) * WMMA_M;
                  uint32_t colStore = (blockIdx.x * BXNum * WarpXNum + warpX*WarpXNum + j) * WMMA_N;
                  //ldm: leading dimension of the destination C in global memory
                  wmma::store_matrix_sync(C + rowStore*N + colStore,CFrag[i][j],N,wmma::mem_row_major);
              }
          }
      }
      

      6. Launch code

      #include <cuda_runtime.h>
      
      #include "common/tester.h"
      #include "common/common.h"
      
      //there is no built-in half4, so float2 (the same 8-byte width) stands in for it
      #define CAST_HALF4(point) (reinterpret_cast<float2*>(point))
      #define CAST_HALF2(point) (reinterpret_cast<half2*>(point))
      #define CAST_FLOAT4(point) (reinterpret_cast<float4*>(point))
      
      using namespace nvcuda;
      
      
      void hgemm_wmma_m16n16k16_naive(half *A,half *B,half *C, int M,int N,int K)
      {
          // WMMA tile shape handled by one warp
          const uint32_t WMMA_M = 16;    
          const uint32_t WMMA_N = 16;
          const uint32_t WMMA_K = 16;
          dim3 blockSize(32);
          dim3 gridSize(div_ceil(N,WMMA_N),div_ceil(M,WMMA_M));
          hgemm_wmma_m16n16k16_naive_kernel<WMMA_M,WMMA_N,WMMA_K><<<gridSize,blockSize>>>(A,B,C,M,N,K);
      }
      
      void hgemm_wmma_m16n16k16_block2x4(half *A,half *B,half *C, int M,int N,int K)
      {
          // WMMA tile shape handled by one warp
          const uint32_t WMMA_M = 16;    
          const uint32_t WMMA_N = 16;
          const uint32_t WMMA_K = 16;
          const uint32_t nBxNum = 2;
          const uint32_t nByNum = 4;
          // warp grid (2,4): each block computes a 64x32 tile of C
          dim3 blockSize(8,32); 
          dim3 gridSize(div_ceil(N,WMMA_N*nBxNum),div_ceil(M,WMMA_M*nByNum));
          hgemm_wmma_m16n16k16_block2x4_kernel<WMMA_M,WMMA_N,WMMA_K,nBxNum,nByNum><<<gridSize,blockSize>>>(A,B,C,M,N,K);
      }
      
      void hgemm_wmma_m16n16k16_block2x4_wmma4x2(half *A,half *B,half *C, int M,int N,int K)
      {
          // WMMA tile shape handled by one warp
          const uint32_t WMMA_M = 16;    
          const uint32_t WMMA_N = 16;
          const uint32_t WMMA_K = 16;
          const uint32_t nBxNum = 2;
          const uint32_t nByNum = 4;
          const uint32_t nWarpXNum = 4;
          const uint32_t nWarpYNum = 2;
          // warp grid (2,4), per-warp tiling (4,2): each block computes a 128x128 tile of C
          dim3 blockSize(8,32); 
          dim3 gridSize(div_ceil(N,WMMA_N*nBxNum*nWarpXNum),div_ceil(M,WMMA_M*nByNum*nWarpYNum));
          hgemm_wmma_m16n16k16_block2x4_wmma4x2_kernel<WMMA_M,WMMA_N,WMMA_K,nBxNum,nByNum,nWarpXNum,nWarpYNum><<<gridSize,blockSize>>>(A,B,C,M,N,K);
      }
      
      void hgemm_wmma_m16n16k16_block2x4_wmma4x2_dBuff_async(half *A,half *B,half *C, int M,int N,int K)
      {
          // WMMA tile shape handled by one warp
          const uint32_t WMMA_M = 16;    
          const uint32_t WMMA_N = 16;
          const uint32_t WMMA_K = 16;
          const uint32_t nBxNum = 2;
          const uint32_t nByNum = 4;
          const uint32_t nWarpXNum = 4;
          const uint32_t nWarpYNum = 2;
          // warp grid (2,4), per-warp tiling (4,2): each block computes a 128x128 tile of C
          dim3 blockSize(8,32); 
          dim3 gridSize(div_ceil(N,WMMA_N*nBxNum*nWarpXNum),div_ceil(M,WMMA_M*nByNum*nWarpYNum));
          hgemm_wmma_m16n16k16_block2x4_wmma4x2_dBuff_async_kernel<WMMA_M,WMMA_N,WMMA_K,nBxNum,nByNum,nWarpXNum,nWarpYNum,8><<<gridSize,blockSize>>>(A,B,C,M,N,K);
      }
      
      int main(int argc, char** argv)
      {
          {
              Tester tester(512,2048,1024,1,10,100,true);
              tester.evaluate(hgemm_wmma_m16n16k16_naive,"hgemm_wmma_m16n16k16_naive");
          }
      
          {
              Tester tester(512,2048,1024,1,10,100,true);
              tester.evaluate(hgemm_wmma_m16n16k16_block2x4,"hgemm_wmma_m16n16k16_block2x4");
          }
      
          {
              Tester tester(512,2048,1024,1,10,100,true);
              tester.evaluate(hgemm_wmma_m16n16k16_block2x4_wmma4x2,"hgemm_wmma_m16n16k16_block2x4_wmma4x2");
          }
      
          {
              Tester tester(512,2048,1024,1,10,100,true);
              tester.evaluate(hgemm_wmma_m16n16k16_block2x4_wmma4x2_dBuff_async,"hgemm_wmma_m16n16k16_block2x4_wmma4x2_dBuff_async");
          }
      
          return 0;
      }
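
      For reference, a plausible build line (an assumption: cp.async needs compute capability 8.0 or newer, and the source file name here is only illustrative):

      nvcc -O3 -arch=sm_80 hgemm_wmma.cu -o hgemm_wmma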
      
      