stdlib_lapack_cosine_sine2.fypp Source File


Source Code

#:include "common.fypp" 
submodule(stdlib_lapack_eig_svd_lsq) stdlib_lapack_cosine_sine2
  implicit none


  contains
#:for ik,it,ii in LINALG_INT_KINDS_TYPES

     recursive module subroutine stdlib${ii}$_sorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! SORCSD computes the CS decomposition of an M-by-M partitioned
     !! orthogonal matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(sp), intent(out) :: theta(*)
           real(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           real(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Arrays 
           real(sp) :: dummy(1_${ik}$)
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_sorcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, iwork, info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_sorcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              itaup1 = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_sorgqr( m-q, m-q, m-q, dummy, max(1_${ik}$,m-q), dummy, work, -1_${ik}$,childinfo )
                        
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_sorglq( m-q, m-q, m-q, dummy, max(1_${ik}$,m-q), dummy, work, -1_${ik}$,childinfo )
                        
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, dummy, dummy, dummy, dummy, dummy,dummy,work,-1_${ik}$,childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              ib11d = itauq2 + max( 1_${ik}$, m - q )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,dummy, dummy, u1, &
              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, dummy, dummy, dummy, dummy, dummy, dummy,&
                        dummy, dummy, work, -1_${ik}$, childinfo )
              lbbcsdworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -22_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'SORCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_sorglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 call stdlib${ii}$_slacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), ldv2t )
                           
                 call stdlib${ii}$_sorglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                           lorglqwork, info )
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_sorglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_sorglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_sorgqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 call stdlib${ii}$_slacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
                           
                 call stdlib${ii}$_sorgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
                      info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_slapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_slapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_slapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_sorcsd
     end subroutine stdlib${ii}$_sorcsd

     recursive module subroutine stdlib${ii}$_dorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! DORCSD computes the CS decomposition of an M-by-M partitioned
     !! orthogonal matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(dp), intent(out) :: theta(*)
           real(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           real(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_dorcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, iwork, info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_dorcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              itaup1 = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_dorgqr( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_dorglq( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, theta, v1t, u1, u2, v1t,v2t, work, -1_${ik}$, childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              ib11d = itauq2 + max( 1_${ik}$, m - q )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, u1, u1, u1, u1, u1, u1, u1, u1, work, -1_${ik}$,&
                        childinfo )
              lbbcsdworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -22_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DORCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_dorglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 if (m-p > q) then
                    call stdlib${ii}$_dlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
                              ldv2t )
                 end if
                 if (m > q) then
                    call stdlib${ii}$_dorglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                              lorglqwork, info )
                 end if
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_dorglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_dorglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_dorgqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 call stdlib${ii}$_dlacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
                           
                 call stdlib${ii}$_dorgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
                      info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_dlapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_dlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_dlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_dorcsd
     end subroutine stdlib${ii}$_dorcsd

#:for rk,rt,ri in REAL_KINDS_TYPES
#:if not rk in ["sp","dp"]
     recursive module subroutine stdlib${ii}$_${ri}$orcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! DORCSD: computes the CS decomposition of an M-by-M partitioned
     !! orthogonal matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${rk}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(${rk}$), intent(out) :: theta(*)
           real(${rk}$), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           real(${rk}$), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_${ri}$orcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, iwork, info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_${ri}$orcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              itaup1 = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ri}$orgqr( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ri}$orglq( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ri}$orbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, theta, v1t, u1, u2, v1t,v2t, work, -1_${ik}$, childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              ib11d = itauq2 + max( 1_${ik}$, m - q )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, u1, u1, u1, u1, u1, u1, u1, u1, work, -1_${ik}$,&
                        childinfo )
              lbbcsdworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -22_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DORCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_${ri}$orbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ri}$orgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ri}$orgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_${ri}$orglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 if (m-p > q) then
                    call stdlib${ii}$_${ri}$lacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
                              ldv2t )
                 end if
                 if (m > q) then
                    call stdlib${ii}$_${ri}$orglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                              lorglqwork, info )
                 end if
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ri}$orglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ri}$orglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_${ri}$orgqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
                           
                 call stdlib${ii}$_${ri}$orgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
                      info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_${ri}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_${ri}$lapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_${ri}$lapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_${ri}$lapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_${ri}$orcsd
     end subroutine stdlib${ii}$_${ri}$orcsd

#:endif
#:endfor



     module subroutine stdlib${ii}$_sorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! SORCSD2BY1 computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: theta(*)
           real(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(sp) :: dum1(1_${ik}$), dum2(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-------------------------------------------------------|
           ! | lworkopt (1)                                          |
           ! |-------------------------------------------------------|
           ! | phi (max(1,r-1))                                      |
           ! |-------------------------------------------------------|
           ! | taup1 (max(1,p))                        | b11d (r)    |
           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
           ! | tauq1 (max(1,q))                        | b12d (r)    |
           ! |-----------------------------------------| b12e (r-1)  |
           ! | stdlib${ii}$_sorbdb work | stdlib${ii}$_sorgqr work | stdlib${ii}$_sorglq work | b21d (r)    |
           ! |             |             |             | b21e (r-1)  |
           ! |             |             |             | b22d (r)    |
           ! |             |             |             | b22e (r-1)  |
           ! |             |             |             | stdlib${ii}$_sbbcsd work |
           ! |-------------------------------------------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = iphi + max( 1_${ik}$, r-1 )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, work, -1_${ik}$,childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_sorglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
                 ldu1, u2, ldu2, v1t, ldv1t, dum2,1_${ik}$, dum1, dum1, dum1, dum1, dum1,dum1, dum1, &
                           dum1, work(1_${ik}$), -1_${ik}$, childinfo)
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, dum1,work(1_${ik}$), -1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_sorglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_sbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
                 ldv1t, dum2, 1_${ik}$, u1, ldu1, u2,ldu2, dum1, dum1, dum1, dum1, dum1,dum1, dum1, dum1,&
                            work(1_${ik}$), -1_${ik}$, childinfo)
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2, dum1,work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_sorglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_sbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
                 dum2, 1_${ik}$, v1t, ldv1t, u2, ldu2,u1, ldu1, dum1, dum1, dum1, dum1,dum1, dum1, dum1, &
                           dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, dum1,work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( p, p, m-q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_sorgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_sorglq( q, q, q, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_sbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
                 ldu2, u1, ldu1, dum2, 1_${ik}$,v1t, ldv1t, dum1, dum1, dum1, dum1,dum1, dum1, dum1, &
                           dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              end if
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'SORCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_slacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_sorglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1_${ik}$, work(ib11d), work(ib11e), work(ib12d),work(&
              ib12e), work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,&
                        childinfo )
              ! permute rows and columns to place zero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = one
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                    u1(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_slacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_sorgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_sorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_sbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
              ldv1t, dum1, 1_${ik}$, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
                        )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = one
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                    u2(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_slacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_sorgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_sorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_sbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
              dum1, 1_${ik}$, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_slapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_slapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
                        
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_scopy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_scopy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_slacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_sorgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_slacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_sorgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_slacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_slacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_slacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_sorglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_sbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
              u2, ldu2, u1, ldu1, dum1, 1_${ik}$,v1t, ldv1t, work(ib11d), work(ib11e), work(ib12d),work(&
              ib12e), work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,&
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_slapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_slapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_sorcsd2by1

     module subroutine stdlib${ii}$_dorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! DORCSD2BY1 computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
        ! -- lapack computational routine (3.5.0_dp) --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           ! Array Arguments 
           real(dp), intent(out) :: theta(*)
           real(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(dp) :: dum1(1_${ik}$), dum2(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-------------------------------------------------------|
           ! | lworkopt (1)                                          |
           ! |-------------------------------------------------------|
           ! | phi (max(1,r-1))                                      |
           ! |-------------------------------------------------------|
           ! | taup1 (max(1,p))                        | b11d (r)    |
           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
           ! | tauq1 (max(1,q))                        | b12d (r)    |
           ! |-----------------------------------------| b12e (r-1)  |
           ! | stdlib${ii}$_dorbdb work | stdlib${ii}$_dorgqr work | stdlib${ii}$_dorglq work | b21d (r)    |
           ! |             |             |             | b21e (r-1)  |
           ! |             |             |             | b22d (r)    |
           ! |             |             |             | b22e (r-1)  |
           ! |             |             |             | stdlib${ii}$_dbbcsd work |
           ! |-------------------------------------------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = iphi + max( 1_${ik}$, r-1 )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, work,-1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_dorglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
                 ldu1, u2, ldu2, v1t, ldv1t,dum2, 1_${ik}$, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
                            work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, dum1,work(1_${ik}$), -1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_dorglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_dbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
                 ldv1t, dum2, 1_${ik}$, u1, ldu1,u2, ldu2, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1, &
                           work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,dum1, work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_dorglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_dbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
                 dum2, 1_${ik}$, v1t, ldv1t, u2,ldu2, u1, ldu1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,&
                           dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( p, p, m-q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_dorgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_dorglq( q, q, q, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_dbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
                 ldu2, u1, ldu1, dum2,1_${ik}$, v1t, ldv1t, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
                            work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              end if
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DORCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_dlacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_dorglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1_${ik}$, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place zero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = one
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                    u1(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_dlacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_dorgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_dorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_dbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
              ldv1t, dum2, 1_${ik}$, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
                        )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = one
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                    u2(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_dlacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_dorgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_dorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_dbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
              dum2, 1_${ik}$, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_dlapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_dlapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
                        
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_dcopy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_dcopy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_dlacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_dorgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_dlacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_dorgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_dlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_dlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_dlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_dorglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_dbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
              u2, ldu2, u1, ldu1, dum2,1_${ik}$, v1t, ldv1t, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_dlapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_dlapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_dorcsd2by1

#:for rk,rt,ri in REAL_KINDS_TYPES
#:if not rk in ["sp","dp"]
     module subroutine stdlib${ii}$_${ri}$orcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! DORCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
        ! -- lapack computational routine (3.5.0_${rk}$) --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${rk}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           ! Array Arguments 
           real(${rk}$), intent(out) :: theta(*)
           real(${rk}$), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           real(${rk}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(${rk}$) :: dum1(1_${ik}$), dum2(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-------------------------------------------------------|
           ! | lworkopt (1)                                          |
           ! |-------------------------------------------------------|
           ! | phi (max(1,r-1))                                      |
           ! |-------------------------------------------------------|
           ! | taup1 (max(1,p))                        | b11d (r)    |
           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
           ! | tauq1 (max(1,q))                        | b12d (r)    |
           ! |-----------------------------------------| b12e (r-1)  |
           ! | stdlib${ii}$_${ri}$orbdb work | stdlib${ii}$_${ri}$orgqr work | stdlib${ii}$_${ri}$orglq work | b21d (r)    |
           ! |             |             |             | b21e (r-1)  |
           ! |             |             |             | b22d (r)    |
           ! |             |             |             | b22e (r-1)  |
           ! |             |             |             | stdlib${ii}$_${ri}$bbcsd work |
           ! |-------------------------------------------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = iphi + max( 1_${ik}$, r-1 )
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_${ri}$orbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1, work,-1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
                 ldu1, u2, ldu2, v1t, ldv1t,dum2, 1_${ik}$, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
                            work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_${ri}$orbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, dum1,work(1_${ik}$), -1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( m-p, m-p, q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ri}$bbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
                 ldv1t, dum2, 1_${ik}$, u1, ldu1,u2, ldu2, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1, &
                           work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_${ri}$orbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( p, p, q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,dum1, work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orglq( q, q, r, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ri}$bbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
                 dum2, 1_${ik}$, v1t, ldv1t, u2,ldu2, u1, ldu1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,&
                           dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_${ri}$orbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
                           dum1,dum1, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( p, p, m-q, u1, ldu1, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1_${ik}$),-1_${ik}$, childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ri}$orglq( q, q, q, v1t, ldv1t, dum1, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ri}$bbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
                 ldu2, u1, ldu1, dum2,1_${ik}$, v1t, ldv1t, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
                            work(1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( work(1_${ik}$),KIND=${ik}$)
              end if
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
                        1_${ik}$ )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DORCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ri}$orbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ri}$orgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ri}$orgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = one
                 do j = 2, q
                    v1t(1_${ik}$,j) = zero
                    v1t(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_${ri}$lacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_${ri}$orglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1_${ik}$, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place zero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_${ri}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ri}$orbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = one
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                    u1(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_${ri}$lacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_${ri}$orgqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ri}$orgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_${ri}$orglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ri}$bbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
              ldv1t, dum2, 1_${ik}$, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
                        )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_${ri}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ri}$orbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ri}$orgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = one
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                    u2(j,1_${ik}$) = zero
                 end do
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_${ri}$orgqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_${ri}$orglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ri}$bbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
              dum2, 1_${ik}$, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_${ri}$lapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_${ri}$lapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ri}$orbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
                        
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$copy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$copy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_${ri}$lacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_${ri}$orgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = zero
                 end do
                 call stdlib${ii}$_${ri}$lacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_${ri}$orgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_${ri}$lacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_${ri}$lacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_${ri}$orglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ri}$bbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
              u2, ldu2, u1, ldu1, dum2,1_${ik}$, v1t, ldv1t, work(ib11d), work(ib11e),work(ib12d), work(&
              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
                        childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_${ri}$lapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_${ri}$lapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_${ri}$orcsd2by1

#:endif
#:endfor



     module subroutine stdlib${ii}$_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
     !! SORBDB simultaneously bidiagonalizes the blocks of an M-by-M
     !! partitioned orthogonal matrix X:
     !! [ B11 | B12 0  0 ]
     !! [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**T
     !! X = [-----------] = [---------] [----------------] [---------]   .
     !! [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
     !! [  0  |  0  0  I ]
     !! X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
     !! not the case, then X must be transposed and/or permuted. This can be
     !! done in constant time using the TRANS and SIGNS options. See SORCSD
     !! for details.)
     !! The orthogonal matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
     !! (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
     !! represented implicitly by Householder vectors.
     !! B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           real(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
           real(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ====================================================================
           ! Parameters 
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery
           integer(${ik}$) :: i, lworkmin, lworkopt
           real(sp) :: z1, z2, z3, z4
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( .not. stdlib_lsame( signs, 'O' ) ) then
              z1 = one
              z2 = one
              z3 = one
              z4 = one
           else
              z1 = one
              z2 = -one
              z3 = one
              z4 = -one
           end if
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -4_${ik}$
           else if( q < 0_${ik}$ .or. q > p .or. q > m-p .or.q > m-q ) then
              info = -5_${ik}$
           else if( colmajor .and. ldx11 < max( 1_${ik}$, p ) ) then
              info = -7_${ik}$
           else if( .not.colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
              info = -7_${ik}$
           else if( colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
              info = -9_${ik}$
           else if( .not.colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
              info = -9_${ik}$
           else if( colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -11_${ik}$
           else if( .not.colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
              info = -11_${ik}$
           else if( colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
              info = -13_${ik}$
           else if( .not.colmajor .and. ldx22 < max( 1_${ik}$, m-q )