stdlib_lapack_cosine_sine.fypp Source File


Source Code

#:include "common.fypp" 
submodule(stdlib_lapack_eig_svd_lsq) stdlib_lapack_cosine_sine
  implicit none


  contains
#:for ik,it,ii in LINALG_INT_KINDS_TYPES

     pure module subroutine stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! SBBCSD computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See SORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(sp), intent(inout) :: phi(*), theta(*)
           real(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(sp), parameter :: hundred = 100.0_sp
           real(sp), parameter :: meighth = -0.125_sp
           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'SBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_slamch( 'EPSILON' )
           unfl = stdlib${ii}$_slamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_slartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_slartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_slartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_slartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_sscal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_sscal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_sscal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_sscal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_sswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_sswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_sswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_sswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_sswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_sswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_sswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_sswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_sbbcsd

     pure module subroutine stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! DBBCSD computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See DORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(dp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(dp), intent(inout) :: phi(*), theta(*)
           real(dp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(dp), parameter :: hundred = 100.0_dp
           real(dp), parameter :: meighth = -0.125_dp
           real(dp), parameter :: piover2 = 1.57079632679489661923132169163975144210_dp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(dp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_dlamch( 'EPSILON' )
           unfl = stdlib${ii}$_dlamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_dlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_dlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_dlartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_dlartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_dlartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_dscal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_dscal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_dscal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_dscal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_dswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_dswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_dswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_dswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_dswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_dswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_dswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_dswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_dbbcsd

#:for rk,rt,ri in REAL_KINDS_TYPES
#:if not rk in ["sp","dp"]
     pure module subroutine stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! DBBCSD: computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See DORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${rk}$, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(${rk}$), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(${rk}$), intent(inout) :: phi(*), theta(*)
           real(${rk}$), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(${rk}$), parameter :: hundred = 100.0_${rk}$
           real(${rk}$), parameter :: meighth = -0.125_${rk}$
           real(${rk}$), parameter :: piover2 = 1.57079632679489661923132169163975144210_${rk}$
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(${rk}$) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_${ri}$lamch( 'EPSILON' )
           unfl = stdlib${ii}$_${ri}$lamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_${ri}$las2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_${ri}$las2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_${ri}$lartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_${ri}$lartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${ri}$lartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${ri}$lartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_${ri}$lartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_${ri}$lartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${ri}$lartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_${ri}$lartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${ri}$lartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_${ri}$scal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ri}$scal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ri}$scal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_${ri}$scal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_${ri}$swap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_${ri}$swap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_${ri}$swap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_${ri}$swap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_${ri}$swap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_${ri}$swap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_${ri}$swap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_${ri}$swap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_${ri}$bbcsd

#:endif
#:endfor

     pure module subroutine stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
               lrwork, info )
     !! CBBCSD computes the CS decomposition of a unitary matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See CUNCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The unitary matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), rwork(*)
           real(sp), intent(inout) :: phi(*), theta(*)
           complex(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
                     
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(sp), parameter :: hundred = 100.0_sp
           real(sp), parameter :: meighth = -0.125_sp
           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lrwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lrworkmin = 1_${ik}$
              rwork(1_${ik}$) = lrworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lrworkopt = iv2tsn + q - 1_${ik}$
              lrworkmin = lrworkopt
              rwork(1_${ik}$) = lrworkopt
              if( lrwork < lrworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_slamch( 'EPSILON' )
           unfl = stdlib${ii}$_slamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              end if
              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
                        
              b11d(imin) = temp
              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
                        
              b21d(imin) = temp
              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
                           1_${ik}$), r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,rwork(iu1cs+imin-1), &
                           rwork(iu1sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
                           1_${ik}$), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,rwork(iu2cs+imin-1), &
                           rwork(iu2sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
                           imin-1) )
              end if
              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 end if
                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 else
                    call stdlib${ii}$_slartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 end if
                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
                              r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
                              
                 end if
                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_slartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_slartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              end if
              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
                        
              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
                        
              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_clasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_clasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_clasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_clasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_clasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_clasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_clasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_clasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_cscal( q, cnegone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_cscal( q, cnegone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_cscal( p, cnegone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_cscal( p, cnegone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_cscal( m-p, cnegone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_cscal( m-p, cnegone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_cscal( m-q, cnegone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_cscal( m-q, cnegone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_cswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_cswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_cswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_cswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_cswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_cswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_cswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_cswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_cbbcsd

     pure module subroutine stdlib${ii}$_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! ZBBCSD computes the CS decomposition of a unitary matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See ZUNCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The unitary matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
               lrwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
           ! Array Arguments 
           real(dp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), rwork(*)
           real(dp), intent(inout) :: phi(*), theta(*)
           complex(dp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
                     
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(dp), parameter :: hundred = 100.0_dp
           real(dp), parameter :: meighth = -0.125_dp
           real(dp), parameter :: piover2 = 1.57079632679489661923132169163975144210_dp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
           real(dp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lrwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lrworkmin = 1_${ik}$
              rwork(1_${ik}$) = lrworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lrworkopt = iv2tsn + q - 1_${ik}$
              lrworkmin = lrworkopt
              rwork(1_${ik}$) = lrworkopt
              if( lrwork < lrworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_dlamch( 'EPSILON' )
           unfl = stdlib${ii}$_dlamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_dlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_dlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              end if
              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
                        
              b11d(imin) = temp
              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
                        
              b21d(imin) = temp
              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
                           1_${ik}$), r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,rwork(iu1cs+imin-1), &
                           rwork(iu1sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
                           1_${ik}$), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,rwork(iu2cs+imin-1), &
                           rwork(iu2sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
                           imin-1) )
              end if
              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 else
                    call stdlib${ii}$_dlartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 end if
                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 else
                    call stdlib${ii}$_dlartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 end if
                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_dlartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
                              r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_dlartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
                              
                 end if
                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_dlartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_dlartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              end if
              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
                        
              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
                        
              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_zlasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_zlasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_zlasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_zlasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_zlasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_zlasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_zlasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_zlasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_zscal( q, cnegone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_zscal( q, cnegone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_zscal( p, cnegone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_zscal( p, cnegone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_zscal( m-p, cnegone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_zscal( m-p, cnegone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_zscal( m-q, cnegone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_zscal( m-q, cnegone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_zswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_zswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_zswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_zswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_zswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_zswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_zswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_zswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_zbbcsd

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     pure module subroutine stdlib${ii}$_${ci}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! ZBBCSD: computes the CS decomposition of a unitary matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See ZUNCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The unitary matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
               lrwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
           ! Array Arguments 
           real(${ck}$), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), rwork(*)
           real(${ck}$), intent(inout) :: phi(*), theta(*)
           complex(${ck}$), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
                     
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(${ck}$), parameter :: hundred = 100.0_${ck}$
           real(${ck}$), parameter :: meighth = -0.125_${ck}$
           real(${ck}$), parameter :: piover2 = 1.57079632679489661923132169163975144210_${ck}$
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
           real(${ck}$) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lrwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lrworkmin = 1_${ik}$
              rwork(1_${ik}$) = lrworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lrworkopt = iv2tsn + q - 1_${ik}$
              lrworkmin = lrworkopt
              rwork(1_${ik}$) = lrworkopt
              if( lrwork < lrworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_${c2ri(ci)}$lamch( 'EPSILON' )
           unfl = stdlib${ii}$_${c2ri(ci)}$lamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_${c2ri(ci)}$las2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_${c2ri(ci)}$las2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              else
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              end if
              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
                        
              b11d(imin) = temp
              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
                        
              b21d(imin) = temp
              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
                           1_${ik}$), r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,rwork(iu1cs+imin-1), &
                           rwork(iu1sn+imin-1) )
              else
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
                           1_${ik}$), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,rwork(iu2cs+imin-1), &
                           rwork(iu2sn+imin-1) )
              else
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
                           imin-1) )
              end if
              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 else
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 end if
                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 else
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 end if
                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
                              r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_${c2ri(ci)}$lartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
                              
                 end if
                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_${c2ri(ci)}$lartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
                           iv2tsn+imax-1-1) )
              end if
              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
                        
              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
                        
              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ci}$lasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_${ci}$lasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ci}$lasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_${ci}$lasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ci}$lasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_${ci}$lasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ci}$lasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_${ci}$lasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
                              rwork(iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ci}$scal( q, cnegone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_${ci}$scal( q, cnegone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ci}$scal( p, cnegone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ci}$scal( p, cnegone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ci}$scal( m-p, cnegone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ci}$scal( m-p, cnegone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ci}$scal( m-q, cnegone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_${ci}$scal( m-q, cnegone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_${ci}$swap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_${ci}$swap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_${ci}$swap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_${ci}$swap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_${ci}$swap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_${ci}$swap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_${ci}$swap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_${ci}$swap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_${ci}$bbcsd

#:endif
#:endfor



     recursive module subroutine stdlib${ii}$_cuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! CUNCSD computes the CS decomposition of an M-by-M partitioned
     !! unitary matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, rwork, lrwork,iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lrwork, lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(sp), intent(out) :: theta(*)
           real(sp), intent(out) :: rwork(*)
           complex(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           complex(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt, p1, q1
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           integer(${ik}$) :: lrworkmin, lrworkopt
           logical(lk) :: lrquery
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           lrquery = lrwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_cuncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_cuncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ! real workspace
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
                        theta, theta, rwork, -1_${ik}$, childinfo )
              lbbcsdworkopt = int( rwork(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lrworkopt = ibbcsd + lbbcsdworkopt - 1_${ik}$
              lrworkmin = ibbcsd + lbbcsdworkmin - 1_${ik}$
              rwork(1_${ik}$) = lrworkopt
              ! complex workspace
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_cungqr( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_cunglq( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1_${ik}$, childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -22_${ik}$
              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -24_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lrwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNCSD', -info )
              return
           else if( lquery .or. lrquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_cunglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 if( m-p > q ) then
                    call stdlib${ii}$_clacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
                              ldv2t )
                 end if
                 if( m > q ) then
                    call stdlib${ii}$_cunglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                              lorglqwork, info )
                 end if
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_cunglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_cunglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_cungqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 p1 = min( p+1, m )
                 q1 = min( q+1, m )
                 call stdlib${ii}$_clacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 if ( m > p+q ) then
                    call stdlib${ii}$_clacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
                              
                 end if
                 call stdlib${ii}$_cungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
                     lbbcsdwork, info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_clapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_clapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_clapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_cuncsd
     end subroutine stdlib${ii}$_cuncsd

     recursive module subroutine stdlib${ii}$_zuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! ZUNCSD computes the CS decomposition of an M-by-M partitioned
     !! unitary matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, rwork, lrwork,iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lrwork, lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(dp), intent(out) :: theta(*)
           real(dp), intent(out) :: rwork(*)
           complex(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           complex(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt, p1, q1
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           integer(${ik}$) :: lrworkmin, lrworkopt
           logical(lk) :: lrquery
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           lrquery = lrwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_zuncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_zuncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ! real workspace
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
                        theta, theta, rwork, -1_${ik}$, childinfo )
              lbbcsdworkopt = int( rwork(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lrworkopt = ibbcsd + lbbcsdworkopt - 1_${ik}$
              lrworkmin = ibbcsd + lbbcsdworkmin - 1_${ik}$
              rwork(1_${ik}$) = lrworkopt
              ! complex workspace
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_zungqr( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_zunglq( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1_${ik}$, childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -22_${ik}$
              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -24_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lrwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNCSD', -info )
              return
           else if( lquery .or. lrquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_zunglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 if( m-p > q) then
                    call stdlib${ii}$_zlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
                              ldv2t )
                 end if
                 if( m > q ) then
                    call stdlib${ii}$_zunglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                              lorglqwork, info )
                 end if
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_zunglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_zunglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_zungqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 p1 = min( p+1, m )
                 q1 = min( q+1, m )
                 call stdlib${ii}$_zlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 if( m > p+q ) then
                    call stdlib${ii}$_zlacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
                              
                 end if
                 call stdlib${ii}$_zungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
                     lbbcsdwork, info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_zlapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_zlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_zlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_zuncsd
     end subroutine stdlib${ii}$_zuncsd

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     recursive module subroutine stdlib${ii}$_${ci}$uncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
     !! ZUNCSD: computes the CS decomposition of an M-by-M partitioned
     !! unitary matrix X:
     !! [  I  0  0 |  0  0  0 ]
     !! [  0  C  0 |  0 -S  0 ]
     !! [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
     !! X = [-----------] = [---------] [---------------------] [---------]   .
     !! [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
     !! [  0  S  0 |  0  C  0 ]
     !! [  0  0  I |  0  0  0 ]
     !! X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
     !! (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
     !! R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
     !! which R = MIN(P,M-P,Q,M-Q).
     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
               work, lwork, rwork, lrwork,iwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
                     lrwork, lwork, m, p, q
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(${ck}$), intent(out) :: theta(*)
           real(${ck}$), intent(out) :: rwork(*)
           complex(${ck}$), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
                     
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ===================================================================
           
           ! Local Scalars 
           character :: transt, signst
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
                     lworkopt, p1, q1
           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
           integer(${ik}$) :: lrworkmin, lrworkopt
           logical(lk) :: lrquery
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           defaultsigns = .not. stdlib_lsame( signs, 'O' )
           lquery = lwork == -1_${ik}$
           lrquery = lrwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -7_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -8_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -9_${ik}$
           else if ( colmajor .and.  ldx11 < max( 1_${ik}$, p ) ) then
             info = -11_${ik}$
           else if (.not. colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
             info = -11_${ik}$
           else if (colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
             info = -13_${ik}$
           else if (.not. colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
             info = -13_${ik}$
           else if (colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
             info = -15_${ik}$
           else if (.not. colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
             info = -15_${ik}$
           else if (colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
             info = -17_${ik}$
           else if (.not. colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
             info = -17_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -20_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -22_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -24_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -26_${ik}$
           end if
           ! work with transpose if convenient
           if( info == 0_${ik}$ .and. min( p, m-p ) < min( q, m-q ) ) then
              if( colmajor ) then
                 transt = 'T'
              else
                 transt = 'N'
              end if
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_${ci}$uncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
              return
           end if
           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
           ! convenient
           if( info == 0_${ik}$ .and. m-q < q ) then
              if( defaultsigns ) then
                 signst = 'O'
              else
                 signst = 'D'
              end if
              call stdlib${ii}$_${ci}$uncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ! real workspace
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, q - 1_${ik}$ )
              ib11e = ib11d + max( 1_${ik}$, q )
              ib12d = ib11e + max( 1_${ik}$, q - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, q )
              ib21d = ib12e + max( 1_${ik}$, q - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, q )
              ib22d = ib21e + max( 1_${ik}$, q - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, q )
              ibbcsd = ib22e + max( 1_${ik}$, q - 1_${ik}$ )
              call stdlib${ii}$_${ci}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
                        theta, theta, rwork, -1_${ik}$, childinfo )
              lbbcsdworkopt = int( rwork(1_${ik}$),KIND=${ik}$)
              lbbcsdworkmin = lbbcsdworkopt
              lrworkopt = ibbcsd + lbbcsdworkopt - 1_${ik}$
              lrworkmin = ibbcsd + lbbcsdworkmin - 1_${ik}$
              rwork(1_${ik}$) = lrworkopt
              ! complex workspace
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m - p )
              itauq2 = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ci}$ungqr( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorgqrworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorgqrworkmin = max( 1_${ik}$, m - q )
              iorglq = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ci}$unglq( m-q, m-q, m-q, u1, max(1_${ik}$,m-q), u1, work, -1_${ik}$,childinfo )
              lorglqworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorglqworkmin = max( 1_${ik}$, m - q )
              iorbdb = itauq2 + max( 1_${ik}$, m - q )
              call stdlib${ii}$_${ci}$unbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1_${ik}$, childinfo )
              lorbdbworkopt = int( work(1_${ik}$),KIND=${ik}$)
              lorbdbworkmin = lorbdbworkopt
              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
                        lorbdbworkopt ) - 1_${ik}$
              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
                        lorbdbworkmin ) - 1_${ik}$
              work(1_${ik}$) = max(lworkopt,lworkmin)
              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -22_${ik}$
              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
                 info = -24_${ik}$
              else
                 lorgqrwork = lwork - iorgqr + 1_${ik}$
                 lorglqwork = lwork - iorglq + 1_${ik}$
                 lorbdbwork = lwork - iorbdb + 1_${ik}$
                 lbbcsdwork = lrwork - ibbcsd + 1_${ik}$
              end if
           end if
           ! abort if any illegal arguments
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNCSD', -info )
              return
           else if( lquery .or. lrquery ) then
              return
           end if
           ! transform to bidiagonal block form
           call stdlib${ii}$_${ci}$unbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
                     iorbdb), lorbdbwork, childinfo )
           ! accumulate householder reflectors
           if( colmajor ) then
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ci}$ungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ci}$ungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', q-1, q-1, x11(1_${ik}$,2_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_${ci}$unglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglqwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
                 if( m-p > q) then
                    call stdlib${ii}$_${ci}$lacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
                              ldv2t )
                 end if
                 if( m > q ) then
                    call stdlib${ii}$_${ci}$unglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
                              lorglqwork, info )
                 end if
              end if
           else
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ci}$unglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
                           info)
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ci}$unglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
                            info )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', q-1, q-1, x11(2_${ik}$,1_${ik}$), ldx11, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 v1t(1_${ik}$, 1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_${ci}$ungqr( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorgqr), &
                           lorgqrwork, info )
              end if
              if( wantv2t .and. m-q > 0_${ik}$ ) then
                 p1 = min( p+1, m )
                 q1 = min( q+1, m )
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
                 if( m > p+q ) then
                    call stdlib${ii}$_${ci}$lacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
                              
                 end if
                 call stdlib${ii}$_${ci}$ungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
                           lorgqrwork, info )
              end if
           end if
           ! compute the csd of the matrix in bidiagonal-block form
           call stdlib${ii}$_${ci}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
                     lbbcsdwork, info )
           ! permute rows and columns to place identity submatrices in top-
           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
           ! block and/or bottom-right corner of (2,1)-block and/or top-left
           ! corner of (2,2)-block
           if( q > 0_${ik}$ .and. wantu2 ) then
              do i = 1, q
                 iwork(i) = m - p - q + i
              end do
              do i = q + 1, m - p
                 iwork(i) = i - q
              end do
              if( colmajor ) then
                 call stdlib${ii}$_${ci}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              else
                 call stdlib${ii}$_${ci}$lapmr( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           end if
           if( m > 0_${ik}$ .and. wantv2t ) then
              do i = 1, p
                 iwork(i) = m - p - q + i
              end do
              do i = p + 1, m - q
                 iwork(i) = i - p
              end do
              if( .not. colmajor ) then
                 call stdlib${ii}$_${ci}$lapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
              else
                 call stdlib${ii}$_${ci}$lapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
              end if
           end if
           return
           ! end stdlib${ii}$_${ci}$uncsd
     end subroutine stdlib${ii}$_${ci}$uncsd

#:endif
#:endfor



     module subroutine stdlib${ii}$_cuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! CUNCSD2BY1 computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           integer(${ik}$), intent(in) :: lrwork
           integer(${ik}$) :: lrworkmin, lrworkopt
           ! Array Arguments 
           real(sp), intent(out) :: rwork(*)
           real(sp), intent(out) :: theta(*)
           complex(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(sp) :: dum(1_${ik}$)
           complex(sp) :: cdum(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = ( lwork==-1_${ik}$ ) .or. ( lrwork==-1_${ik}$ )
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-----------------------------------------|
           ! | lworkopt (1)                            |
           ! |-----------------------------------------|
           ! | taup1 (max(1,p))                        |
           ! | taup2 (max(1,m-p))                      |
           ! | tauq1 (max(1,q))                        |
           ! |-----------------------------------------|
           ! | stdlib${ii}$_cunbdb work | stdlib${ii}$_cungqr work | stdlib${ii}$_cunglq work |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |-----------------------------------------|
             ! rwork layout:
           ! |------------------|
           ! | lrworkopt (1)    |
           ! |------------------|
           ! | phi (max(1,r-1)) |
           ! |------------------|
           ! | b11d (r)         |
           ! | b11e (r-1)       |
           ! | b12d (r)         |
           ! | b12e (r-1)       |
           ! | b21d (r)         |
           ! | b21e (r-1)       |
           ! | b22d (r)         |
           ! | b22e (r-1)       |
           ! | stdlib${ii}$_cbbcsd rwork     |
           ! |------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum, cdum, cdum, &
                           cdum, work, -1_${ik}$,childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_cunglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum(1_${ik}$), u1, &
                 ldu1, u2, ldu2, v1t, ldv1t, cdum,1_${ik}$, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, cdum, work(1_${ik}$),-1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_cunglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_cbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
                 ldv1t, cdum, 1_${ik}$, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2, cdum,work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_cunglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_cbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
                  1_${ik}$, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                            1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, cdum, work(1_${ik}$), -1_${ik}$, childinfo)
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( p, p, m-q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_cungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_cunglq( q, q, q, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_cbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
                 ldu2, u1, ldu1, cdum, 1_${ik}$, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              end if
              lrworkmin = ibbcsd+lbbcsd-1
              lrworkopt = lrworkmin
              rwork(1_${ik}$) = lrworkopt
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
              if( lrwork < lrworkmin .and. .not.lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_clacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_cunglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t, cdum,1_${ik}$, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
                        ibbcsd+1, childinfo )
              ! permute rows and columns to place czero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = cone
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                    u1(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_clacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_cungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_cunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_cbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
               ldv1t, cdum, 1_${ik}$, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
                         lbbcsd,childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = cone
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                    u2(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_clacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_cungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_cunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_cbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
              cdum, 1_${ik}$, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_clapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_clapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
                        childinfo )
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_ccopy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_ccopy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_clacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_cungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_clacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_cungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_clacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_clacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_clacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_cunglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_cbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
              u2, ldu2, u1, ldu1, cdum, 1_${ik}$,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_clapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_clapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_cuncsd2by1

     module subroutine stdlib${ii}$_zuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! ZUNCSD2BY1 computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           integer(${ik}$), intent(in) :: lrwork
           integer(${ik}$) :: lrworkmin, lrworkopt
           ! Array Arguments 
           real(dp), intent(out) :: rwork(*)
           real(dp), intent(out) :: theta(*)
           complex(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(dp) :: dum(1_${ik}$)
           complex(dp) :: cdum(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = ( lwork==-1_${ik}$ ) .or. ( lrwork==-1_${ik}$ )
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-----------------------------------------|
           ! | lworkopt (1)                            |
           ! |-----------------------------------------|
           ! | taup1 (max(1,p))                        |
           ! | taup2 (max(1,m-p))                      |
           ! | tauq1 (max(1,q))                        |
           ! |-----------------------------------------|
           ! | stdlib${ii}$_zunbdb work | stdlib${ii}$_zungqr work | stdlib${ii}$_zunglq work |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |-----------------------------------------|
             ! rwork layout:
           ! |------------------|
           ! | lrworkopt (1)    |
           ! |------------------|
           ! | phi (max(1,r-1)) |
           ! |------------------|
           ! | b11d (r)         |
           ! | b11e (r-1)       |
           ! | b12d (r)         |
           ! | b12e (r-1)       |
           ! | b21d (r)         |
           ! | b21e (r-1)       |
           ! | b22d (r)         |
           ! | b22e (r-1)       |
           ! | stdlib${ii}$_zbbcsd rwork     |
           ! |------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work, -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_zunglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_zbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum, u1, ldu1,&
                  u2, ldu2, v1t, ldv1t, cdum, 1_${ik}$,dum, dum, dum, dum, dum, dum, dum, dum,rwork(1_${ik}$), -&
                            1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, cdum, work(1_${ik}$),-1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_zunglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_zbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
                 ldv1t, cdum, 1_${ik}$, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2, cdum,work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_zunglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_zbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
                  1_${ik}$, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                            1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, cdum, work(1_${ik}$), -1_${ik}$, childinfo)
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( p, p, m-q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_zungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_zunglq( q, q, q, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_zbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
                 ldu2, u1, ldu1, cdum, 1_${ik}$, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              end if
              lrworkmin = ibbcsd+lbbcsd-1
              lrworkopt = lrworkmin
              rwork(1_${ik}$) = lrworkopt
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
              if( lrwork < lrworkmin .and. .not.lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_zlacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_zunglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_zbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t, cdum,1_${ik}$, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
                        ibbcsd+1, childinfo )
              ! permute rows and columns to place czero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = cone
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                    u1(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_zlacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_zungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_zunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_zbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
               ldv1t, cdum, 1_${ik}$, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
                         lbbcsd,childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = cone
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                    u2(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_zlacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_zungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_zunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_zbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
              cdum, 1_${ik}$, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_zlapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_zlapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
                        childinfo )
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_zcopy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_zcopy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_zlacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_zungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_zlacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_zungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_zlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_zlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_zlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_zunglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_zbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
              u2, ldu2, u1, ldu1, cdum, 1_${ik}$,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_zlapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_zlapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_zuncsd2by1

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$uncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
     !! ZUNCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
     !! orthonormal columns that has been partitioned into a 2-by-1 block
     !! structure:
     !! [  I1 0  0 ]
     !! [  0  C  0 ]
     !! [ X11 ]   [ U1 |    ] [  0  0  0 ]
     !! X = [-----] = [---------] [----------] V1**T .
     !! [ X21 ]   [    | U2 ] [  0  0  0 ]
     !! [  0  S  0 ]
     !! [  0  0  I2]
     !! X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
     !! (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
     !! nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
     !! R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
     !! K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
           integer(${ik}$), intent(in) :: lrwork
           integer(${ik}$) :: lrworkmin, lrworkopt
           ! Array Arguments 
           real(${ck}$), intent(out) :: rwork(*)
           real(${ck}$), intent(out) :: theta(*)
           complex(${ck}$), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
           integer(${ik}$), intent(out) :: iwork(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
                     r
           logical(lk) :: lquery, wantu1, wantu2, wantv1t
           ! Local Arrays 
           real(${ck}$) :: dum(1_${ik}$)
           complex(${ck}$) :: cdum(1_${ik}$,1_${ik}$)
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           lquery = ( lwork==-1_${ik}$ ) .or. ( lrwork==-1_${ik}$ )
           if( m < 0_${ik}$ ) then
              info = -4_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -5_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -6_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -8_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -10_${ik}$
           else if( wantu1 .and. ldu1 < max( 1_${ik}$, p ) ) then
              info = -13_${ik}$
           else if( wantu2 .and. ldu2 < max( 1_${ik}$, m - p ) ) then
              info = -15_${ik}$
           else if( wantv1t .and. ldv1t < max( 1_${ik}$, q ) ) then
              info = -17_${ik}$
           end if
           r = min( p, m-p, q, m-q )
           ! compute workspace
             ! work layout:
           ! |-----------------------------------------|
           ! | lworkopt (1)                            |
           ! |-----------------------------------------|
           ! | taup1 (max(1,p))                        |
           ! | taup2 (max(1,m-p))                      |
           ! | tauq1 (max(1,q))                        |
           ! |-----------------------------------------|
           ! | stdlib${ii}$_${ci}$unbdb work | stdlib${ii}$_${ci}$ungqr work | stdlib${ii}$_${ci}$unglq work |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |             |             |             |
           ! |-----------------------------------------|
             ! rwork layout:
           ! |------------------|
           ! | lrworkopt (1)    |
           ! |------------------|
           ! | phi (max(1,r-1)) |
           ! |------------------|
           ! | b11d (r)         |
           ! | b11e (r-1)       |
           ! | b12d (r)         |
           ! | b12e (r-1)       |
           ! | b21d (r)         |
           ! | b21e (r-1)       |
           ! | b22d (r)         |
           ! | b22e (r-1)       |
           ! | stdlib${ii}$_${ci}$bbcsd rwork     |
           ! |------------------|
           if( info == 0_${ik}$ ) then
              iphi = 2_${ik}$
              ib11d = iphi + max( 1_${ik}$, r-1 )
              ib11e = ib11d + max( 1_${ik}$, r )
              ib12d = ib11e + max( 1_${ik}$, r - 1_${ik}$ )
              ib12e = ib12d + max( 1_${ik}$, r )
              ib21d = ib12e + max( 1_${ik}$, r - 1_${ik}$ )
              ib21e = ib21d + max( 1_${ik}$, r )
              ib22d = ib21e + max( 1_${ik}$, r - 1_${ik}$ )
              ib22e = ib22d + max( 1_${ik}$, r )
              ibbcsd = ib22e + max( 1_${ik}$, r - 1_${ik}$ )
              itaup1 = 2_${ik}$
              itaup2 = itaup1 + max( 1_${ik}$, p )
              itauq1 = itaup2 + max( 1_${ik}$, m-p )
              iorbdb = itauq1 + max( 1_${ik}$, q )
              iorgqr = itauq1 + max( 1_${ik}$, q )
              iorglq = itauq1 + max( 1_${ik}$, q )
              lorgqrmin = 1_${ik}$
              lorgqropt = 1_${ik}$
              lorglqmin = 1_${ik}$
              lorglqopt = 1_${ik}$
              if( r == q ) then
                 call stdlib${ii}$_${ci}$unbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work, -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 endif
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$unglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                              
                    lorglqmin = max( lorglqmin, q-1 )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ci}$bbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum, u1, ldu1,&
                  u2, ldu2, v1t, ldv1t, cdum, 1_${ik}$,dum, dum, dum, dum, dum, dum, dum, dum,rwork(1_${ik}$), -&
                            1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == p ) then
                 call stdlib${ii}$_${ci}$unbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, cdum, work(1_${ik}$),-1_${ik}$, childinfo &
                              )
                    lorgqrmin = max( lorgqrmin, p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( m-p, m-p, q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$unglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ci}$bbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
                 ldv1t, cdum, 1_${ik}$, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else if( r == m-p ) then
                 call stdlib${ii}$_${ci}$unbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, work(1_${ik}$), -1_${ik}$, childinfo )
                 lorbdb = int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( p, p, q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2, cdum,work(1_${ik}$), -1_${ik}$, &
                              childinfo )
                    lorgqrmin = max( lorgqrmin, m-p-1 )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$unglq( q, q, r, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ci}$bbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
                  1_${ik}$, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                            1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              else
                 call stdlib${ii}$_${ci}$unbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
                           cdum, cdum, work(1_${ik}$), -1_${ik}$, childinfo)
                 lorbdb = m + int( work(1_${ik}$),KIND=${ik}$)
                 if( wantu1 .and. p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( p, p, m-q, u1, ldu1, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorgqrmin = max( lorgqrmin, p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantu2 .and. m-p > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$ungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                              
                    lorgqrmin = max( lorgqrmin, m-p )
                    lorgqropt = max( lorgqropt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 if( wantv1t .and. q > 0_${ik}$ ) then
                    call stdlib${ii}$_${ci}$unglq( q, q, q, v1t, ldv1t, cdum, work(1_${ik}$), -1_${ik}$,childinfo )
                    lorglqmin = max( lorglqmin, q )
                    lorglqopt = max( lorglqopt, int( work(1_${ik}$),KIND=${ik}$) )
                 end if
                 call stdlib${ii}$_${ci}$bbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
                 ldu2, u1, ldu1, cdum, 1_${ik}$, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
                           1_${ik}$), -1_${ik}$, childinfo )
                 lbbcsd = int( rwork(1_${ik}$),KIND=${ik}$)
              end if
              lrworkmin = ibbcsd+lbbcsd-1
              lrworkopt = lrworkmin
              rwork(1_${ik}$) = lrworkopt
              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                 info = -19_${ik}$
              end if
              if( lrwork < lrworkmin .and. .not.lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNCSD2BY1', -info )
              return
           else if( lquery ) then
              return
           end if
           lorgqr = lwork-iorgqr+1
           lorglq = lwork-iorglq+1
           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
           ! in which r = min(p,m-p,q,m-q)
           if( r == q ) then
              ! case 1: r = q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ci}$unbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ci}$ungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ci}$ungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 v1t(1_${ik}$,1_${ik}$) = cone
                 do j = 2, q
                    v1t(1_${ik}$,j) = czero
                    v1t(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_${ci}$lacpy( 'U', q-1, q-1, x21(1_${ik}$,2_${ik}$), ldx21, v1t(2_${ik}$,2_${ik}$),ldv1t )
                 call stdlib${ii}$_${ci}$unglq( q-1, q-1, q-1, v1t(2_${ik}$,2_${ik}$), ldv1t, work(itauq1),work(iorglq), &
                           lorglq, childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ci}$bbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
              ldu1, u2, ldu2, v1t, ldv1t, cdum,1_${ik}$, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
                        ibbcsd+1, childinfo )
              ! permute rows and columns to place czero submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_${ci}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == p ) then
              ! case 2: r = p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ci}$unbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 u1(1_${ik}$,1_${ik}$) = cone
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                    u1(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_${ci}$lacpy( 'L', p-1, p-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$), ldu1 )
                 call stdlib${ii}$_${ci}$ungqr( p-1, p-1, p-1, u1(2_${ik}$,2_${ik}$), ldu1, work(itaup1),work(iorgqr), &
                           lorgqr, childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
                 call stdlib${ii}$_${ci}$ungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
                 call stdlib${ii}$_${ci}$unglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ci}$bbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
               ldv1t, cdum, 1_${ik}$, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
                         lbbcsd,childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > 0_${ik}$ .and. wantu2 ) then
                 do i = 1, q
                    iwork(i) = m - p - q + i
                 end do
                 do i = q + 1, m - p
                    iwork(i) = i - q
                 end do
                 call stdlib${ii}$_${ci}$lapmt( .false., m-p, m-p, u2, ldu2, iwork )
              end if
           else if( r == m-p ) then
              ! case 3: r = m-p
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ci}$unbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
              ! accumulate householder reflectors
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
                 call stdlib${ii}$_${ci}$ungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 u2(1_${ik}$,1_${ik}$) = cone
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                    u2(j,1_${ik}$) = czero
                 end do
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-p-1, m-p-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_${ci}$ungqr( m-p-1, m-p-1, m-p-1, u2(2_${ik}$,2_${ik}$), ldu2,work(itaup2), work(iorgqr)&
                           , lorgqr, childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_${ci}$unglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ci}$bbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
              cdum, 1_${ik}$, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( q > r ) then
                 do i = 1, r
                    iwork(i) = q - r + i
                 end do
                 do i = r + 1, q
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_${ci}$lapmt( .false., p, q, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_${ci}$lapmr( .false., q, q, v1t, ldv1t, iwork )
                 end if
              end if
           else
              ! case 4: r = m-q
              ! simultaneously bidiagonalize x11 and x21
              call stdlib${ii}$_${ci}$unbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
                        childinfo )
              ! accumulate householder reflectors
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$copy( m-p, work(iorbdb+p), 1_${ik}$, u2, 1_${ik}$ )
              end if
              if( wantu1 .and. p > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$copy( p, work(iorbdb), 1_${ik}$, u1, 1_${ik}$ )
                 do j = 2, p
                    u1(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_${ci}$lacpy( 'L', p-1, m-q-1, x11(2_${ik}$,1_${ik}$), ldx11, u1(2_${ik}$,2_${ik}$),ldu1 )
                 call stdlib${ii}$_${ci}$ungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantu2 .and. m-p > 0_${ik}$ ) then
                 do j = 2, m-p
                    u2(1_${ik}$,j) = czero
                 end do
                 call stdlib${ii}$_${ci}$lacpy( 'L', m-p-1, m-q-1, x21(2_${ik}$,1_${ik}$), ldx21, u2(2_${ik}$,2_${ik}$),ldu2 )
                 call stdlib${ii}$_${ci}$ungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
                           childinfo )
              end if
              if( wantv1t .and. q > 0_${ik}$ ) then
                 call stdlib${ii}$_${ci}$lacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
                 call stdlib${ii}$_${ci}$lacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
                           1_${ik}$), ldv1t )
                 call stdlib${ii}$_${ci}$lacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
                           
                 call stdlib${ii}$_${ci}$unglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
                           childinfo )
              end if
              ! simultaneously diagonalize x11 and x21.
              call stdlib${ii}$_${ci}$bbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
              u2, ldu2, u1, ldu1, cdum, 1_${ik}$,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
                        lbbcsd, childinfo )
              ! permute rows and columns to place identity submatrices in
              ! preferred positions
              if( p > r ) then
                 do i = 1, r
                    iwork(i) = p - r + i
                 end do
                 do i = r + 1, p
                    iwork(i) = i - r
                 end do
                 if( wantu1 ) then
                    call stdlib${ii}$_${ci}$lapmt( .false., p, p, u1, ldu1, iwork )
                 end if
                 if( wantv1t ) then
                    call stdlib${ii}$_${ci}$lapmr( .false., p, q, v1t, ldv1t, iwork )
                 end if
              end if
           end if
           return
     end subroutine stdlib${ii}$_${ci}$uncsd2by1

#:endif
#:endfor



     module subroutine stdlib${ii}$_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
     !! CUNBDB simultaneously bidiagonalizes the blocks of an M-by-M
     !! partitioned unitary matrix X:
     !! [ B11 | B12 0  0 ]
     !! [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
     !! X = [-----------] = [---------] [----------------] [---------]   .
     !! [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
     !! [  0  |  0  0  I ]
     !! X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
     !! not the case, then X must be transposed and/or permuted. This can be
     !! done in constant time using the TRANS and SIGNS options. See CUNCSD
     !! for details.)
     !! The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
     !! (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
     !! represented implicitly by Householder vectors.
     !! B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ====================================================================
           ! Parameters 
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery
           integer(${ik}$) :: i, lworkmin, lworkopt
           real(sp) :: z1, z2, z3, z4
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( .not. stdlib_lsame( signs, 'O' ) ) then
              z1 = one
              z2 = one
              z3 = one
              z4 = one
           else
              z1 = one
              z2 = -one
              z3 = one
              z4 = -one
           end if
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -4_${ik}$
           else if( q < 0_${ik}$ .or. q > p .or. q > m-p .or.q > m-q ) then
              info = -5_${ik}$
           else if( colmajor .and. ldx11 < max( 1_${ik}$, p ) ) then
              info = -7_${ik}$
           else if( .not.colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
              info = -7_${ik}$
           else if( colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
              info = -9_${ik}$
           else if( .not.colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
              info = -9_${ik}$
           else if( colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -11_${ik}$
           else if( .not.colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
              info = -11_${ik}$
           else if( colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
              info = -13_${ik}$
           else if( .not.colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
              info = -13_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              lworkopt = m - q
              lworkmin = m - q
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'XORBDB', -info )
              return
           else if( lquery ) then
              return
           end if
           ! handle column-major and row-major separately
           if( colmajor ) then
              ! reduce columns 1, ..., q of x11, x12, x21, and x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_cscal( p-i+1, cmplx( z1, 0.0_sp,KIND=sp), x11(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_cscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_sp,KIND=sp),x11(i,i), &
                              1_${ik}$ )
                    call stdlib${ii}$_caxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), x12(&
                              i,i-1), 1_${ik}$, x11(i,i), 1_${ik}$ )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_cscal( m-p-i+1, cmplx( z2, 0.0_sp,KIND=sp), x21(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_cscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_sp,KIND=sp),x21(i,i),&
                               1_${ik}$ )
                    call stdlib${ii}$_caxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), &
                              x22(i,i-1), 1_${ik}$, x21(i,i), 1_${ik}$ )
                 end if
                 theta(i) = atan2( stdlib${ii}$_scnrm2( m-p-i+1, x21(i,i), 1_${ik}$ ),stdlib${ii}$_scnrm2( p-i+1, &
                           x11(i,i), 1_${ik}$ ) )
                 if( p > i ) then
                    call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
                 else if ( p == i ) then
                    call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i,i), 1_${ik}$, taup1(i) )
                 end if
                 x11(i,i) = cone
                 if ( m-p > i ) then
                    call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$,taup2(i) )
                 else if ( m-p == i ) then
                    call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i,i), 1_${ik}$,taup2(i) )
                 end if
                 x21(i,i) = cone
                 if ( q > i ) then
                    call stdlib${ii}$_clarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$,conjg(taup1(i)), x11(i,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$,conjg(taup2(i)), x21(i,i+1),&
                               ldx21, work )
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_clarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1_${ik}$,conjg(taup1(i)), x12(i,i),&
                               ldx12, work )
                    call stdlib${ii}$_clarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1_${ik}$,conjg(taup2(i)), x22(i,&
                              i), ldx22, work )
                 end if
                 if( i < q ) then
                    call stdlib${ii}$_cscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_sp,KIND=sp),x11(i,i+&
                              1_${ik}$), ldx11 )
                    call stdlib${ii}$_caxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_sp,KIND=sp),x21(i,i+1)&
                              , ldx21, x11(i,i+1), ldx11 )
                 end if
                 call stdlib${ii}$_cscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_sp,KIND=sp),x12(i,i)&
                           , ldx12 )
                 call stdlib${ii}$_caxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_sp,KIND=sp),x22(i,i),&
                            ldx22, x12(i,i), ldx12 )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_scnrm2( q-i, x11(i,i+1), ldx11 ),stdlib${ii}$_scnrm2(&
                            m-q-i+1, x12(i,i), ldx12 ) )
                 if( i < q ) then
                    call stdlib${ii}$_clacgv( q-i, x11(i,i+1), ldx11 )
                    if ( i == q-1 ) then
                       call stdlib${ii}$_clarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
                    else
                       call stdlib${ii}$_clarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
                    end if
                    x11(i,i+1) = cone
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_clacgv( m-q-i+1, x12(i,i), ldx12 )
                    if ( m-q == i ) then
                       call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                    else
                       call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                                 
                    end if
                 end if
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_clarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_clarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
                              ldx21, work )
                 end if
                 if ( p > i ) then
                    call stdlib${ii}$_clarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if ( m-p > i ) then
                    call stdlib${ii}$_clarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
                              ldx22, work )
                 end if
                 if( i < q )call stdlib${ii}$_clacgv( q-i, x11(i,i+1), ldx11 )
                 call stdlib${ii}$_clacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_cscal( m-q-i+1, cmplx( -z1*z4, 0.0_sp,KIND=sp), x12(i,i),ldx12 )
                           
                 call stdlib${ii}$_clacgv( m-q-i+1, x12(i,i), ldx12 )
                 if ( i >= m-q ) then
                    call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                 else
                    call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                 end if
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_clarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_clarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
                            x22(q+1,i), ldx22, work )
                 call stdlib${ii}$_clacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_cscal( m-p-q-i+1, cmplx( z2*z4, 0.0_sp,KIND=sp),x22(q+i,p+i), ldx22 )
                           
                 call stdlib${ii}$_clacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
                 call stdlib${ii}$_clarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
                           
                 x22(q+i,p+i) = cone
                 call stdlib${ii}$_clarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
                           q+i+1,p+i), ldx22, work )
                 call stdlib${ii}$_clacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
              end do
           else
              ! reduce columns 1, ..., q of x11, x12, x21, x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_cscal( p-i+1, cmplx( z1, 0.0_sp,KIND=sp), x11(i,i),ldx11 )
                 else
                    call stdlib${ii}$_cscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_sp,KIND=sp),x11(i,i), &
                              ldx11 )
                    call stdlib${ii}$_caxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), x12(&
                              i-1,i), ldx12, x11(i,i), ldx11 )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_cscal( m-p-i+1, cmplx( z2, 0.0_sp,KIND=sp), x21(i,i),ldx21 )
                              
                 else
                    call stdlib${ii}$_cscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_sp,KIND=sp),x21(i,i),&
                               ldx21 )
                    call stdlib${ii}$_caxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), &
                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
                 end if
                 theta(i) = atan2( stdlib${ii}$_scnrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib${ii}$_scnrm2( p-i+1,&
                            x11(i,i), ldx11 ) )
                 call stdlib${ii}$_clacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_clacgv( m-p-i+1, x21(i,i), ldx21 )
                 call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
                 x11(i,i) = cone
                 if ( i == m-p ) then
                    call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
                 else
                    call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
                 end if
                 x21(i,i) = cone
                 call stdlib${ii}$_clarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
                           work )
                 call stdlib${ii}$_clarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
                           ldx12, work )
                 call stdlib${ii}$_clarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
                           ldx21, work )
                 call stdlib${ii}$_clarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
                           ldx22, work )
                 call stdlib${ii}$_clacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_clacgv( m-p-i+1, x21(i,i), ldx21 )
                 if( i < q ) then
                    call stdlib${ii}$_cscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_sp,KIND=sp),x11(i+1,&
                              i), 1_${ik}$ )
                    call stdlib${ii}$_caxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_sp,KIND=sp),x21(i+1,i)&
                              , 1_${ik}$, x11(i+1,i), 1_${ik}$ )
                 end if
                 call stdlib${ii}$_cscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_sp,KIND=sp),x12(i,i)&
                           , 1_${ik}$ )
                 call stdlib${ii}$_caxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_sp,KIND=sp),x22(i,i),&
                            1_${ik}$, x12(i,i), 1_${ik}$ )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_scnrm2( q-i, x11(i+1,i), 1_${ik}$ ),stdlib${ii}$_scnrm2( m-&
                           q-i+1, x12(i,i), 1_${ik}$ ) )
                 if( i < q ) then
                    call stdlib${ii}$_clarfgp( q-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, tauq1(i) )
                    x11(i+1,i) = cone
                 end if
                 call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_clarf( 'L', q-i, p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x11(i+1,i+1),&
                               ldx11, work )
                    call stdlib${ii}$_clarf( 'L', q-i, m-p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x21(i+1,i+&
                              1_${ik}$), ldx21, work )
                 end if
                 call stdlib${ii}$_clarf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$, conjg(tauq2(i)),x12(i,i+1), &
                           ldx12, work )
                 if ( m-p > i ) then
                    call stdlib${ii}$_clarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x22(i,i+&
                              1_${ik}$), ldx22, work )
                 end if
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_cscal( m-q-i+1, cmplx( -z1*z4, 0.0_sp,KIND=sp), x12(i,i), 1_${ik}$ )
                 call stdlib${ii}$_clarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_clarf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x12(i,i+1),&
                               ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_clarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1_${ik}$,conjg(tauq2(&
                           i)), x22(i,q+1), ldx22, work )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_cscal( m-p-q-i+1, cmplx( z2*z4, 0.0_sp,KIND=sp),x22(p+i,q+i), 1_${ik}$ )
                           
                 call stdlib${ii}$_clarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1_${ik}$,tauq2(p+i) )
                           
                 x22(p+i,q+i) = cone
                 if ( m-p-q /= i ) then
                    call stdlib${ii}$_clarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1_${ik}$,conjg(tauq2(p+i)),&
                               x22(p+i,q+i+1), ldx22,work )
                 end if
              end do
           end if
           return
     end subroutine stdlib${ii}$_cunbdb

     module subroutine stdlib${ii}$_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
     !! ZUNBDB simultaneously bidiagonalizes the blocks of an M-by-M
     !! partitioned unitary matrix X:
     !! [ B11 | B12 0  0 ]
     !! [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
     !! X = [-----------] = [---------] [----------------] [---------]   .
     !! [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
     !! [  0  |  0  0  I ]
     !! X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
     !! not the case, then X must be transposed and/or permuted. This can be
     !! done in constant time using the TRANS and SIGNS options. See ZUNCSD
     !! for details.)
     !! The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
     !! (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
     !! represented implicitly by Householder vectors.
     !! B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
           ! Array Arguments 
           real(dp), intent(out) :: phi(*), theta(*)
           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ====================================================================
           ! Parameters 
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery
           integer(${ik}$) :: i, lworkmin, lworkopt
           real(dp) :: z1, z2, z3, z4
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( .not. stdlib_lsame( signs, 'O' ) ) then
              z1 = one
              z2 = one
              z3 = one
              z4 = one
           else
              z1 = one
              z2 = -one
              z3 = one
              z4 = -one
           end if
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -4_${ik}$
           else if( q < 0_${ik}$ .or. q > p .or. q > m-p .or.q > m-q ) then
              info = -5_${ik}$
           else if( colmajor .and. ldx11 < max( 1_${ik}$, p ) ) then
              info = -7_${ik}$
           else if( .not.colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
              info = -7_${ik}$
           else if( colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
              info = -9_${ik}$
           else if( .not.colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
              info = -9_${ik}$
           else if( colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -11_${ik}$
           else if( .not.colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
              info = -11_${ik}$
           else if( colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
              info = -13_${ik}$
           else if( .not.colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
              info = -13_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              lworkopt = m - q
              lworkmin = m - q
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'XORBDB', -info )
              return
           else if( lquery ) then
              return
           end if
           ! handle column-major and row-major separately
           if( colmajor ) then
              ! reduce columns 1, ..., q of x11, x12, x21, and x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_zscal( p-i+1, cmplx( z1, 0.0_dp,KIND=dp), x11(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_zscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_dp,KIND=dp),x11(i,i), &
                              1_${ik}$ )
                    call stdlib${ii}$_zaxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), x12(&
                              i,i-1), 1_${ik}$, x11(i,i), 1_${ik}$ )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_zscal( m-p-i+1, cmplx( z2, 0.0_dp,KIND=dp), x21(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_zscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_dp,KIND=dp),x21(i,i),&
                               1_${ik}$ )
                    call stdlib${ii}$_zaxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), &
                              x22(i,i-1), 1_${ik}$, x21(i,i), 1_${ik}$ )
                 end if
                 theta(i) = atan2( stdlib${ii}$_dznrm2( m-p-i+1, x21(i,i), 1_${ik}$ ),stdlib${ii}$_dznrm2( p-i+1, &
                           x11(i,i), 1_${ik}$ ) )
                 if( p > i ) then
                    call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
                 else if ( p == i ) then
                    call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i,i), 1_${ik}$, taup1(i) )
                 end if
                 x11(i,i) = cone
                 if ( m-p > i ) then
                    call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$,taup2(i) )
                 else if ( m-p == i ) then
                    call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i,i), 1_${ik}$,taup2(i) )
                 end if
                 x21(i,i) = cone
                 if ( q > i ) then
                    call stdlib${ii}$_zlarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$,conjg(taup1(i)), x11(i,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$,conjg(taup2(i)), x21(i,i+1),&
                               ldx21, work )
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_zlarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1_${ik}$,conjg(taup1(i)), x12(i,i),&
                               ldx12, work )
                    call stdlib${ii}$_zlarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1_${ik}$,conjg(taup2(i)), x22(i,&
                              i), ldx22, work )
                 end if
                 if( i < q ) then
                    call stdlib${ii}$_zscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_dp,KIND=dp),x11(i,i+&
                              1_${ik}$), ldx11 )
                    call stdlib${ii}$_zaxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_dp,KIND=dp),x21(i,i+1)&
                              , ldx21, x11(i,i+1), ldx11 )
                 end if
                 call stdlib${ii}$_zscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_dp,KIND=dp),x12(i,i)&
                           , ldx12 )
                 call stdlib${ii}$_zaxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_dp,KIND=dp),x22(i,i),&
                            ldx22, x12(i,i), ldx12 )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_dznrm2( q-i, x11(i,i+1), ldx11 ),stdlib${ii}$_dznrm2(&
                            m-q-i+1, x12(i,i), ldx12 ) )
                 if( i < q ) then
                    call stdlib${ii}$_zlacgv( q-i, x11(i,i+1), ldx11 )
                    if ( i == q-1 ) then
                       call stdlib${ii}$_zlarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
                    else
                       call stdlib${ii}$_zlarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
                    end if
                    x11(i,i+1) = cone
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_zlacgv( m-q-i+1, x12(i,i), ldx12 )
                    if ( m-q == i ) then
                       call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                    else
                       call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                                 
                    end if
                 end if
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_zlarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_zlarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
                              ldx21, work )
                 end if
                 if ( p > i ) then
                    call stdlib${ii}$_zlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if ( m-p > i ) then
                    call stdlib${ii}$_zlarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
                              ldx22, work )
                 end if
                 if( i < q )call stdlib${ii}$_zlacgv( q-i, x11(i,i+1), ldx11 )
                 call stdlib${ii}$_zlacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_zscal( m-q-i+1, cmplx( -z1*z4, 0.0_dp,KIND=dp), x12(i,i),ldx12 )
                           
                 call stdlib${ii}$_zlacgv( m-q-i+1, x12(i,i), ldx12 )
                 if ( i >= m-q ) then
                    call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                 else
                    call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                 end if
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_zlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_zlarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
                            x22(q+1,i), ldx22, work )
                 call stdlib${ii}$_zlacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_zscal( m-p-q-i+1, cmplx( z2*z4, 0.0_dp,KIND=dp),x22(q+i,p+i), ldx22 )
                           
                 call stdlib${ii}$_zlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
                 call stdlib${ii}$_zlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
                           
                 x22(q+i,p+i) = cone
                 call stdlib${ii}$_zlarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
                           q+i+1,p+i), ldx22, work )
                 call stdlib${ii}$_zlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
              end do
           else
              ! reduce columns 1, ..., q of x11, x12, x21, x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_zscal( p-i+1, cmplx( z1, 0.0_dp,KIND=dp), x11(i,i),ldx11 )
                 else
                    call stdlib${ii}$_zscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_dp,KIND=dp),x11(i,i), &
                              ldx11 )
                    call stdlib${ii}$_zaxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), x12(&
                              i-1,i), ldx12, x11(i,i), ldx11 )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_zscal( m-p-i+1, cmplx( z2, 0.0_dp,KIND=dp), x21(i,i),ldx21 )
                              
                 else
                    call stdlib${ii}$_zscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_dp,KIND=dp),x21(i,i),&
                               ldx21 )
                    call stdlib${ii}$_zaxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), &
                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
                 end if
                 theta(i) = atan2( stdlib${ii}$_dznrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib${ii}$_dznrm2( p-i+1,&
                            x11(i,i), ldx11 ) )
                 call stdlib${ii}$_zlacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_zlacgv( m-p-i+1, x21(i,i), ldx21 )
                 call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
                 x11(i,i) = cone
                 if ( i == m-p ) then
                    call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
                 else
                    call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
                 end if
                 x21(i,i) = cone
                 call stdlib${ii}$_zlarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
                           work )
                 call stdlib${ii}$_zlarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
                           ldx12, work )
                 call stdlib${ii}$_zlarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
                           ldx21, work )
                 call stdlib${ii}$_zlarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
                           ldx22, work )
                 call stdlib${ii}$_zlacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_zlacgv( m-p-i+1, x21(i,i), ldx21 )
                 if( i < q ) then
                    call stdlib${ii}$_zscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_dp,KIND=dp),x11(i+1,&
                              i), 1_${ik}$ )
                    call stdlib${ii}$_zaxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_dp,KIND=dp),x21(i+1,i)&
                              , 1_${ik}$, x11(i+1,i), 1_${ik}$ )
                 end if
                 call stdlib${ii}$_zscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_dp,KIND=dp),x12(i,i)&
                           , 1_${ik}$ )
                 call stdlib${ii}$_zaxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_dp,KIND=dp),x22(i,i),&
                            1_${ik}$, x12(i,i), 1_${ik}$ )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_dznrm2( q-i, x11(i+1,i), 1_${ik}$ ),stdlib${ii}$_dznrm2( m-&
                           q-i+1, x12(i,i), 1_${ik}$ ) )
                 if( i < q ) then
                    call stdlib${ii}$_zlarfgp( q-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, tauq1(i) )
                    x11(i+1,i) = cone
                 end if
                 call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_zlarf( 'L', q-i, p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x11(i+1,i+1),&
                               ldx11, work )
                    call stdlib${ii}$_zlarf( 'L', q-i, m-p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x21(i+1,i+&
                              1_${ik}$), ldx21, work )
                 end if
                 call stdlib${ii}$_zlarf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x12(i,i+1), &
                           ldx12, work )
                 if ( m-p > i ) then
                    call stdlib${ii}$_zlarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x22(i,i+&
                              1_${ik}$), ldx22, work )
                 end if
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_zscal( m-q-i+1, cmplx( -z1*z4, 0.0_dp,KIND=dp), x12(i,i), 1_${ik}$ )
                 call stdlib${ii}$_zlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_zlarf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x12(i,i+1),&
                               ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_zlarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1_${ik}$,conjg(tauq2(&
                           i)), x22(i,q+1), ldx22, work )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_zscal( m-p-q-i+1, cmplx( z2*z4, 0.0_dp,KIND=dp),x22(p+i,q+i), 1_${ik}$ )
                           
                 call stdlib${ii}$_zlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1_${ik}$,tauq2(p+i) )
                           
                 x22(p+i,q+i) = cone
                 if ( m-p-q /= i ) then
                    call stdlib${ii}$_zlarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1_${ik}$,conjg(tauq2(p+i)),&
                               x22(p+i,q+i+1), ldx22,work )
                 end if
              end do
           end if
           return
     end subroutine stdlib${ii}$_zunbdb

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$unbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
     !! ZUNBDB: simultaneously bidiagonalizes the blocks of an M-by-M
     !! partitioned unitary matrix X:
     !! [ B11 | B12 0  0 ]
     !! [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
     !! X = [-----------] = [---------] [----------------] [---------]   .
     !! [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
     !! [  0  |  0  0  I ]
     !! X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
     !! not the case, then X must be transposed and/or permuted. This can be
     !! done in constant time using the TRANS and SIGNS options. See ZUNCSD
     !! for details.)
     !! The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
     !! (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
     !! represented implicitly by Householder vectors.
     !! B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: signs, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
           ! Array Arguments 
           real(${ck}$), intent(out) :: phi(*), theta(*)
           complex(${ck}$), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
                     
        ! ====================================================================
           ! Parameters 
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery
           integer(${ik}$) :: i, lworkmin, lworkopt
           real(${ck}$) :: z1, z2, z3, z4
           ! Intrinsic Functions
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( .not. stdlib_lsame( signs, 'O' ) ) then
              z1 = one
              z2 = one
              z3 = one
              z4 = one
           else
              z1 = one
              z2 = -one
              z3 = one
              z4 = -one
           end if
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -4_${ik}$
           else if( q < 0_${ik}$ .or. q > p .or. q > m-p .or.q > m-q ) then
              info = -5_${ik}$
           else if( colmajor .and. ldx11 < max( 1_${ik}$, p ) ) then
              info = -7_${ik}$
           else if( .not.colmajor .and. ldx11 < max( 1_${ik}$, q ) ) then
              info = -7_${ik}$
           else if( colmajor .and. ldx12 < max( 1_${ik}$, p ) ) then
              info = -9_${ik}$
           else if( .not.colmajor .and. ldx12 < max( 1_${ik}$, m-q ) ) then
              info = -9_${ik}$
           else if( colmajor .and. ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -11_${ik}$
           else if( .not.colmajor .and. ldx21 < max( 1_${ik}$, q ) ) then
              info = -11_${ik}$
           else if( colmajor .and. ldx22 < max( 1_${ik}$, m-p ) ) then
              info = -13_${ik}$
           else if( .not.colmajor .and. ldx22 < max( 1_${ik}$, m-q ) ) then
              info = -13_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              lworkopt = m - q
              lworkmin = m - q
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -21_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'XORBDB', -info )
              return
           else if( lquery ) then
              return
           end if
           ! handle column-major and row-major separately
           if( colmajor ) then
              ! reduce columns 1, ..., q of x11, x12, x21, and x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_${ci}$scal( p-i+1, cmplx( z1, 0.0_${ck}$,KIND=${ck}$), x11(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_${ci}$scal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_${ck}$,KIND=${ck}$),x11(i,i), &
                              1_${ik}$ )
                    call stdlib${ii}$_${ci}$axpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_${ck}$,KIND=${ck}$), x12(&
                              i,i-1), 1_${ik}$, x11(i,i), 1_${ik}$ )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_${ci}$scal( m-p-i+1, cmplx( z2, 0.0_${ck}$,KIND=${ck}$), x21(i,i), 1_${ik}$ )
                 else
                    call stdlib${ii}$_${ci}$scal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_${ck}$,KIND=${ck}$),x21(i,i),&
                               1_${ik}$ )
                    call stdlib${ii}$_${ci}$axpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_${ck}$,KIND=${ck}$), &
                              x22(i,i-1), 1_${ik}$, x21(i,i), 1_${ik}$ )
                 end if
                 theta(i) = atan2( stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i+1, x21(i,i), 1_${ik}$ ),stdlib${ii}$_${c2ri(ci)}$znrm2( p-i+1, &
                           x11(i,i), 1_${ik}$ ) )
                 if( p > i ) then
                    call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
                 else if ( p == i ) then
                    call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i,i), 1_${ik}$, taup1(i) )
                 end if
                 x11(i,i) = cone
                 if ( m-p > i ) then
                    call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$,taup2(i) )
                 else if ( m-p == i ) then
                    call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i,i), 1_${ik}$,taup2(i) )
                 end if
                 x21(i,i) = cone
                 if ( q > i ) then
                    call stdlib${ii}$_${ci}$larf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$,conjg(taup1(i)), x11(i,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$,conjg(taup2(i)), x21(i,i+1),&
                               ldx21, work )
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_${ci}$larf( 'L', p-i+1, m-q-i+1, x11(i,i), 1_${ik}$,conjg(taup1(i)), x12(i,i),&
                               ldx12, work )
                    call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1_${ik}$,conjg(taup2(i)), x22(i,&
                              i), ldx22, work )
                 end if
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$scal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_${ck}$,KIND=${ck}$),x11(i,i+&
                              1_${ik}$), ldx11 )
                    call stdlib${ii}$_${ci}$axpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_${ck}$,KIND=${ck}$),x21(i,i+1)&
                              , ldx21, x11(i,i+1), ldx11 )
                 end if
                 call stdlib${ii}$_${ci}$scal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_${ck}$,KIND=${ck}$),x12(i,i)&
                           , ldx12 )
                 call stdlib${ii}$_${ci}$axpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_${ck}$,KIND=${ck}$),x22(i,i),&
                            ldx22, x12(i,i), ldx12 )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_${c2ri(ci)}$znrm2( q-i, x11(i,i+1), ldx11 ),stdlib${ii}$_${c2ri(ci)}$znrm2(&
                            m-q-i+1, x12(i,i), ldx12 ) )
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$lacgv( q-i, x11(i,i+1), ldx11 )
                    if ( i == q-1 ) then
                       call stdlib${ii}$_${ci}$larfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
                    else
                       call stdlib${ii}$_${ci}$larfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
                    end if
                    x11(i,i+1) = cone
                 end if
                 if ( m-q+1 > i ) then
                    call stdlib${ii}$_${ci}$lacgv( m-q-i+1, x12(i,i), ldx12 )
                    if ( m-q == i ) then
                       call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                    else
                       call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                                 
                    end if
                 end if
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$larf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
                              ldx11, work )
                    call stdlib${ii}$_${ci}$larf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
                              ldx21, work )
                 end if
                 if ( p > i ) then
                    call stdlib${ii}$_${ci}$larf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if ( m-p > i ) then
                    call stdlib${ii}$_${ci}$larf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
                              ldx22, work )
                 end if
                 if( i < q )call stdlib${ii}$_${ci}$lacgv( q-i, x11(i,i+1), ldx11 )
                 call stdlib${ii}$_${ci}$lacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_${ci}$scal( m-q-i+1, cmplx( -z1*z4, 0.0_${ck}$,KIND=${ck}$), x12(i,i),ldx12 )
                           
                 call stdlib${ii}$_${ci}$lacgv( m-q-i+1, x12(i,i), ldx12 )
                 if ( i >= m-q ) then
                    call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
                 else
                    call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
                 end if
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_${ci}$larf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
                              ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_${ci}$larf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
                            x22(q+1,i), ldx22, work )
                 call stdlib${ii}$_${ci}$lacgv( m-q-i+1, x12(i,i), ldx12 )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_${ci}$scal( m-p-q-i+1, cmplx( z2*z4, 0.0_${ck}$,KIND=${ck}$),x22(q+i,p+i), ldx22 )
                           
                 call stdlib${ii}$_${ci}$lacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
                 call stdlib${ii}$_${ci}$larfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
                           
                 x22(q+i,p+i) = cone
                 call stdlib${ii}$_${ci}$larf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
                           q+i+1,p+i), ldx22, work )
                 call stdlib${ii}$_${ci}$lacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
              end do
           else
              ! reduce columns 1, ..., q of x11, x12, x21, x22
              do i = 1, q
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_${ci}$scal( p-i+1, cmplx( z1, 0.0_${ck}$,KIND=${ck}$), x11(i,i),ldx11 )
                 else
                    call stdlib${ii}$_${ci}$scal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_${ck}$,KIND=${ck}$),x11(i,i), &
                              ldx11 )
                    call stdlib${ii}$_${ci}$axpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_${ck}$,KIND=${ck}$), x12(&
                              i-1,i), ldx12, x11(i,i), ldx11 )
                 end if
                 if( i == 1_${ik}$ ) then
                    call stdlib${ii}$_${ci}$scal( m-p-i+1, cmplx( z2, 0.0_${ck}$,KIND=${ck}$), x21(i,i),ldx21 )
                              
                 else
                    call stdlib${ii}$_${ci}$scal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_${ck}$,KIND=${ck}$),x21(i,i),&
                               ldx21 )
                    call stdlib${ii}$_${ci}$axpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_${ck}$,KIND=${ck}$), &
                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
                 end if
                 theta(i) = atan2( stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib${ii}$_${c2ri(ci)}$znrm2( p-i+1,&
                            x11(i,i), ldx11 ) )
                 call stdlib${ii}$_${ci}$lacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_${ci}$lacgv( m-p-i+1, x21(i,i), ldx21 )
                 call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
                 x11(i,i) = cone
                 if ( i == m-p ) then
                    call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
                 else
                    call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
                 end if
                 x21(i,i) = cone
                 call stdlib${ii}$_${ci}$larf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
                           work )
                 call stdlib${ii}$_${ci}$larf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
                           ldx12, work )
                 call stdlib${ii}$_${ci}$larf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
                           ldx21, work )
                 call stdlib${ii}$_${ci}$larf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
                           ldx22, work )
                 call stdlib${ii}$_${ci}$lacgv( p-i+1, x11(i,i), ldx11 )
                 call stdlib${ii}$_${ci}$lacgv( m-p-i+1, x21(i,i), ldx21 )
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$scal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_${ck}$,KIND=${ck}$),x11(i+1,&
                              i), 1_${ik}$ )
                    call stdlib${ii}$_${ci}$axpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_${ck}$,KIND=${ck}$),x21(i+1,i)&
                              , 1_${ik}$, x11(i+1,i), 1_${ik}$ )
                 end if
                 call stdlib${ii}$_${ci}$scal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_${ck}$,KIND=${ck}$),x12(i,i)&
                           , 1_${ik}$ )
                 call stdlib${ii}$_${ci}$axpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_${ck}$,KIND=${ck}$),x22(i,i),&
                            1_${ik}$, x12(i,i), 1_${ik}$ )
                 if( i < q )phi(i) = atan2( stdlib${ii}$_${c2ri(ci)}$znrm2( q-i, x11(i+1,i), 1_${ik}$ ),stdlib${ii}$_${c2ri(ci)}$znrm2( m-&
                           q-i+1, x12(i,i), 1_${ik}$ ) )
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$larfgp( q-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, tauq1(i) )
                    x11(i+1,i) = cone
                 end if
                 call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if( i < q ) then
                    call stdlib${ii}$_${ci}$larf( 'L', q-i, p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x11(i+1,i+1),&
                               ldx11, work )
                    call stdlib${ii}$_${ci}$larf( 'L', q-i, m-p-i, x11(i+1,i), 1_${ik}$,conjg(tauq1(i)), x21(i+1,i+&
                              1_${ik}$), ldx21, work )
                 end if
                 call stdlib${ii}$_${ci}$larf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x12(i,i+1), &
                           ldx12, work )
                 if ( m-p > i ) then
                    call stdlib${ii}$_${ci}$larf( 'L', m-q-i+1, m-p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x22(i,i+&
                              1_${ik}$), ldx22, work )
                 end if
              end do
              ! reduce columns q + 1, ..., p of x12, x22
              do i = q + 1, p
                 call stdlib${ii}$_${ci}$scal( m-q-i+1, cmplx( -z1*z4, 0.0_${ck}$,KIND=${ck}$), x12(i,i), 1_${ik}$ )
                 call stdlib${ii}$_${ci}$larfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1_${ik}$, tauq2(i) )
                 x12(i,i) = cone
                 if ( p > i ) then
                    call stdlib${ii}$_${ci}$larf( 'L', m-q-i+1, p-i, x12(i,i), 1_${ik}$,conjg(tauq2(i)), x12(i,i+1),&
                               ldx12, work )
                 end if
                 if( m-p-q >= 1_${ik}$ )call stdlib${ii}$_${ci}$larf( 'L', m-q-i+1, m-p-q, x12(i,i), 1_${ik}$,conjg(tauq2(&
                           i)), x22(i,q+1), ldx22, work )
              end do
              ! reduce columns p + 1, ..., m - q of x12, x22
              do i = 1, m - p - q
                 call stdlib${ii}$_${ci}$scal( m-p-q-i+1, cmplx( z2*z4, 0.0_${ck}$,KIND=${ck}$),x22(p+i,q+i), 1_${ik}$ )
                           
                 call stdlib${ii}$_${ci}$larfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1_${ik}$,tauq2(p+i) )
                           
                 x22(p+i,q+i) = cone
                 if ( m-p-q /= i ) then
                    call stdlib${ii}$_${ci}$larf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1_${ik}$,conjg(tauq2(p+i)),&
                               x22(p+i,q+i+1), ldx22,work )
                 end if
              end do
           end if
           return
     end subroutine stdlib${ii}$_${ci}$unbdb

#:endif
#:endfor



     module subroutine stdlib${ii}$_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
     !! M-P, or M-Q. Routines CUNBDB2, CUNBDB3, and CUNBDB4 handle cases in
     !! which Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(sp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < q .or. m-p < q ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. m-q < q ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-2
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB1', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., q of x11 and x21
           do i = 1, q
              call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              theta(i) = atan2( real( x21(i,i),KIND=sp), real( x11(i,i),KIND=sp) )
              c = cos( theta(i) )
              s = sin( theta(i) )
              x11(i,i) = cone
              x21(i,i) = cone
              call stdlib${ii}$_clarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
              if( i < q ) then
                 call stdlib${ii}$_csrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
                 call stdlib${ii}$_clacgv( q-i, x21(i,i+1), ldx21 )
                 call stdlib${ii}$_clarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
                 s = real( x21(i,i+1),KIND=sp)
                 x21(i,i+1) = cone
                 call stdlib${ii}$_clarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_clarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
                           ldx21, work(ilarf) )
                 call stdlib${ii}$_clacgv( q-i, x21(i,i+1), ldx21 )
                 c = sqrt( stdlib${ii}$_scnrm2( p-i, x11(i+1,i+1), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_scnrm2( m-p-i, x21(i+&
                           1_${ik}$,i+1), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
                 call stdlib${ii}$_cunbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1_${ik}$,x21(i+1,i+1), 1_${ik}$, x11(i+1,&
                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
              end if
           end do
           return
     end subroutine stdlib${ii}$_cunbdb1

     module subroutine stdlib${ii}$_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
     !! M-P, or M-Q. Routines ZUNBDB2, ZUNBDB3, and ZUNBDB4 handle cases in
     !! which Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(dp), intent(out) :: phi(*), theta(*)
           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(dp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < q .or. m-p < q ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. m-q < q ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-2
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB1', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., q of x11 and x21
           do i = 1, q
              call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              theta(i) = atan2( real( x21(i,i),KIND=dp), real( x11(i,i),KIND=dp) )
              c = cos( theta(i) )
              s = sin( theta(i) )
              x11(i,i) = cone
              x21(i,i) = cone
              call stdlib${ii}$_zlarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
              if( i < q ) then
                 call stdlib${ii}$_zdrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
                 call stdlib${ii}$_zlacgv( q-i, x21(i,i+1), ldx21 )
                 call stdlib${ii}$_zlarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
                 s = real( x21(i,i+1),KIND=dp)
                 x21(i,i+1) = cone
                 call stdlib${ii}$_zlarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_zlarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
                           ldx21, work(ilarf) )
                 call stdlib${ii}$_zlacgv( q-i, x21(i,i+1), ldx21 )
                 c = sqrt( stdlib${ii}$_dznrm2( p-i, x11(i+1,i+1), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_dznrm2( m-p-i, x21(i+&
                           1_${ik}$,i+1), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
                 call stdlib${ii}$_zunbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1_${ik}$,x21(i+1,i+1), 1_${ik}$, x11(i+1,&
                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
              end if
           end do
           return
     end subroutine stdlib${ii}$_zunbdb1

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$unbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
     !! M-P, or M-Q. Routines ZUNBDB2, ZUNBDB3, and ZUNBDB4 handle cases in
     !! which Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(${ck}$), intent(out) :: phi(*), theta(*)
           complex(${ck}$), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(${ck}$) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < q .or. m-p < q ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. m-q < q ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-2
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB1', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., q of x11 and x21
           do i = 1, q
              call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              theta(i) = atan2( real( x21(i,i),KIND=${ck}$), real( x11(i,i),KIND=${ck}$) )
              c = cos( theta(i) )
              s = sin( theta(i) )
              x11(i,i) = cone
              x21(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
              if( i < q ) then
                 call stdlib${ii}$_${ci}$drot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
                 call stdlib${ii}$_${ci}$lacgv( q-i, x21(i,i+1), ldx21 )
                 call stdlib${ii}$_${ci}$larfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
                 s = real( x21(i,i+1),KIND=${ck}$)
                 x21(i,i+1) = cone
                 call stdlib${ii}$_${ci}$larf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_${ci}$larf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
                           ldx21, work(ilarf) )
                 call stdlib${ii}$_${ci}$lacgv( q-i, x21(i,i+1), ldx21 )
                 c = sqrt( stdlib${ii}$_${c2ri(ci)}$znrm2( p-i, x11(i+1,i+1), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i, x21(i+&
                           1_${ik}$,i+1), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
                 call stdlib${ii}$_${ci}$unbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1_${ik}$,x21(i+1,i+1), 1_${ik}$, x11(i+1,&
                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
              end if
           end do
           return
     end subroutine stdlib${ii}$_${ci}$unbdb1

#:endif
#:endfor



     module subroutine stdlib${ii}$_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
     !! Q, or M-Q. Routines CUNBDB1, CUNBDB3, and CUNBDB4 handle cases in
     !! which P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(sp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < 0_${ik}$ .or. p > m-p ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. q < p .or. m-q < p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB2', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., p of x11 and x21
           do i = 1, p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_csrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
              end if
              call stdlib${ii}$_clacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_clarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              c = real( x11(i,i),KIND=sp)
              x11(i,i) = cone
              call stdlib${ii}$_clarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_clarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_clacgv( q-i+1, x11(i,i), ldx11 )
              s = sqrt( stdlib${ii}$_scnrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_scnrm2( m-p-i+1, x21(i,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_cunbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1_${ik}$, x21(i,i), 1_${ik}$,x11(i+1,i+1), &
                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_cscal( p-i, cnegone, x11(i+1,i), 1_${ik}$ )
              call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              if( i < p ) then
                 call stdlib${ii}$_clarfgp( p-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, taup1(i) )
                 phi(i) = atan2( real( x11(i+1,i),KIND=sp), real( x21(i,i),KIND=sp) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x11(i+1,i) = cone
                 call stdlib${ii}$_clarf( 'L', p-i, q-i, x11(i+1,i), 1_${ik}$, conjg(taup1(i)),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
              end if
              x21(i,i) = cone
              call stdlib${ii}$_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           ! reduce the bottom-right portion of x21 to the identity matrix
           do i = p + 1, q
              call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              x21(i,i) = cone
              call stdlib${ii}$_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_cunbdb2

     module subroutine stdlib${ii}$_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
     !! Q, or M-Q. Routines ZUNBDB1, ZUNBDB3, and ZUNBDB4 handle cases in
     !! which P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(dp), intent(out) :: phi(*), theta(*)
           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(dp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < 0_${ik}$ .or. p > m-p ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. q < p .or. m-q < p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB2', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., p of x11 and x21
           do i = 1, p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_zdrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
              end if
              call stdlib${ii}$_zlacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_zlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              c = real( x11(i,i),KIND=dp)
              x11(i,i) = cone
              call stdlib${ii}$_zlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_zlarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_zlacgv( q-i+1, x11(i,i), ldx11 )
              s = sqrt( stdlib${ii}$_dznrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_dznrm2( m-p-i+1, x21(i,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_zunbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1_${ik}$, x21(i,i), 1_${ik}$,x11(i+1,i+1), &
                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_zscal( p-i, cnegone, x11(i+1,i), 1_${ik}$ )
              call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              if( i < p ) then
                 call stdlib${ii}$_zlarfgp( p-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, taup1(i) )
                 phi(i) = atan2( real( x11(i+1,i),KIND=dp), real( x21(i,i),KIND=dp) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x11(i+1,i) = cone
                 call stdlib${ii}$_zlarf( 'L', p-i, q-i, x11(i+1,i), 1_${ik}$, conjg(taup1(i)),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
              end if
              x21(i,i) = cone
              call stdlib${ii}$_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           ! reduce the bottom-right portion of x21 to the identity matrix
           do i = p + 1, q
              call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              x21(i,i) = cone
              call stdlib${ii}$_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_zunbdb2

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$unbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
     !! Q, or M-Q. Routines ZUNBDB1, ZUNBDB3, and ZUNBDB4 handle cases in
     !! which P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
     !! angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(${ck}$), intent(out) :: phi(*), theta(*)
           complex(${ck}$), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(${ck}$) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < 0_${ik}$ .or. p > m-p ) then
              info = -2_${ik}$
           else if( q < 0_${ik}$ .or. q < p .or. m-q < p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p-1, m-p, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB2', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., p of x11 and x21
           do i = 1, p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_${ci}$drot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
              end if
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_${ci}$larfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              c = real( x11(i,i),KIND=${ck}$)
              x11(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$larf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x11(i,i), ldx11 )
              s = sqrt( stdlib${ii}$_${c2ri(ci)}$znrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i+1, x21(i,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_${ci}$unbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1_${ik}$, x21(i,i), 1_${ik}$,x11(i+1,i+1), &
                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_${ci}$scal( p-i, cnegone, x11(i+1,i), 1_${ik}$ )
              call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              if( i < p ) then
                 call stdlib${ii}$_${ci}$larfgp( p-i, x11(i+1,i), x11(i+2,i), 1_${ik}$, taup1(i) )
                 phi(i) = atan2( real( x11(i+1,i),KIND=${ck}$), real( x21(i,i),KIND=${ck}$) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x11(i+1,i) = cone
                 call stdlib${ii}$_${ci}$larf( 'L', p-i, q-i, x11(i+1,i), 1_${ik}$, conjg(taup1(i)),x11(i+1,i+1), &
                           ldx11, work(ilarf) )
              end if
              x21(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           ! reduce the bottom-right portion of x21 to the identity matrix
           do i = p + 1, q
              call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1_${ik}$, taup2(i) )
              x21(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, q-i, x21(i,i), 1_${ik}$, conjg(taup2(i)),x21(i,i+1), &
                        ldx21, work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_${ci}$unbdb2

#:endif
#:endfor



     module subroutine stdlib${ii}$_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
     !! Q, or M-Q. Routines CUNBDB1, CUNBDB2, and CUNBDB4 handle cases in
     !! which M-P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(sp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( 2_${ik}$*p < m .or. p > m ) then
              info = -2_${ik}$
           else if( q < m-p .or. m-q < m-p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB3', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., m-p of x11 and x21
           do i = 1, m-p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_csrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
              end if
              call stdlib${ii}$_clacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_clarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              s = real( x21(i,i),KIND=sp)
              x21(i,i) = cone
              call stdlib${ii}$_clarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_clarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_clacgv( q-i+1, x21(i,i), ldx21 )
              c = sqrt( stdlib${ii}$_scnrm2( p-i+1, x11(i,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_scnrm2( m-p-i, x21(i+1,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_cunbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1_${ik}$, x21(i+1,i), 1_${ik}$,x11(i,i+1), &
                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              if( i < m-p ) then
                 call stdlib${ii}$_clarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1_${ik}$, taup2(i) )
                 phi(i) = atan2( real( x21(i+1,i),KIND=sp), real( x11(i,i),KIND=sp) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x21(i+1,i) = cone
                 call stdlib${ii}$_clarf( 'L', m-p-i, q-i, x21(i+1,i), 1_${ik}$, conjg(taup2(i)),x21(i+1,i+1), &
                           ldx21, work(ilarf) )
              end if
              x11(i,i) = cone
              call stdlib${ii}$_clarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           ! reduce the bottom-right portion of x11 to the identity matrix
           do i = m-p + 1, q
              call stdlib${ii}$_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_clarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_cunbdb3

     module subroutine stdlib${ii}$_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
     !! Q, or M-Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB4 handle cases in
     !! which M-P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(dp), intent(out) :: phi(*), theta(*)
           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(dp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( 2_${ik}$*p < m .or. p > m ) then
              info = -2_${ik}$
           else if( q < m-p .or. m-q < m-p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB3', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., m-p of x11 and x21
           do i = 1, m-p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_zdrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
              end if
              call stdlib${ii}$_zlacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_zlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              s = real( x21(i,i),KIND=dp)
              x21(i,i) = cone
              call stdlib${ii}$_zlarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_zlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_zlacgv( q-i+1, x21(i,i), ldx21 )
              c = sqrt( stdlib${ii}$_dznrm2( p-i+1, x11(i,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_dznrm2( m-p-i, x21(i+1,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_zunbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1_${ik}$, x21(i+1,i), 1_${ik}$,x11(i,i+1), &
                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              if( i < m-p ) then
                 call stdlib${ii}$_zlarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1_${ik}$, taup2(i) )
                 phi(i) = atan2( real( x21(i+1,i),KIND=dp), real( x11(i,i),KIND=dp) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x21(i+1,i) = cone
                 call stdlib${ii}$_zlarf( 'L', m-p-i, q-i, x21(i+1,i), 1_${ik}$,conjg(taup2(i)), x21(i+1,i+1), &
                           ldx21,work(ilarf) )
              end if
              x11(i,i) = cone
              call stdlib${ii}$_zlarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           ! reduce the bottom-right portion of x11 to the identity matrix
           do i = m-p + 1, q
              call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_zlarf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_zunbdb3

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$unbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
     !! Q, or M-Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB4 handle cases in
     !! which M-P is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               work, lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(${ck}$), intent(out) :: phi(*), theta(*)
           complex(${ck}$), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(${ck}$) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( 2_${ik}$*p < m .or. p > m ) then
              info = -2_${ik}$
           else if( q < m-p .or. m-q < m-p ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( p, m-p-1, q-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q-1
              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB3', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce rows 1, ..., m-p of x11 and x21
           do i = 1, m-p
              if( i > 1_${ik}$ ) then
                 call stdlib${ii}$_${ci}$drot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
              end if
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_${ci}$larfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              s = real( x21(i,i),KIND=${ck}$)
              x21(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$larf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(i,i), ldx21 )
              c = sqrt( stdlib${ii}$_${c2ri(ci)}$znrm2( p-i+1, x11(i,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i, x21(i+1,i), &
                        1_${ik}$ )**2_${ik}$ )
              theta(i) = atan2( s, c )
              call stdlib${ii}$_${ci}$unbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1_${ik}$, x21(i+1,i), 1_${ik}$,x11(i,i+1), &
                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
              call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              if( i < m-p ) then
                 call stdlib${ii}$_${ci}$larfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1_${ik}$, taup2(i) )
                 phi(i) = atan2( real( x21(i+1,i),KIND=${ck}$), real( x11(i,i),KIND=${ck}$) )
                 c = cos( phi(i) )
                 s = sin( phi(i) )
                 x21(i+1,i) = cone
                 call stdlib${ii}$_${ci}$larf( 'L', m-p-i, q-i, x21(i+1,i), 1_${ik}$,conjg(taup2(i)), x21(i+1,i+1), &
                           ldx21,work(ilarf) )
              end if
              x11(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           ! reduce the bottom-right portion of x11 to the identity matrix
           do i = m-p + 1, q
              call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i), x11(i+1,i), 1_${ik}$, taup1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'L', p-i+1, q-i, x11(i,i), 1_${ik}$, conjg(taup1(i)),x11(i,i+1), ldx11, &
                        work(ilarf) )
           end do
           return
     end subroutine stdlib${ii}$_${ci}$unbdb3

#:endif
#:endfor



     module subroutine stdlib${ii}$_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
     !! M-P, or Q. Routines CUNBDB1, CUNBDB2, and CUNBDB3 handle cases in
     !! which M-Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               phantom, work, lwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(sp), intent(out) :: phi(*), theta(*)
           complex(sp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(sp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < m-q .or. m-p < m-q ) then
              info = -2_${ik}$
           else if( q < m-q .or. q > m ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( q-1, p-1, m-p-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q
              lworkopt = ilarf + llarf - 1_${ik}$
              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1_${ik}$ )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB4', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., m-q of x11 and x21
           do i = 1, m-q
              if( i == 1_${ik}$ ) then
                 do j = 1, m
                    phantom(j) = czero
                 end do
                 call stdlib${ii}$_cunbdb5( p, m-p, q, phantom(1_${ik}$), 1_${ik}$, phantom(p+1), 1_${ik}$,x11, ldx11, x21, &
                           ldx21, work(iorbdb5),lorbdb5, childinfo )
                 call stdlib${ii}$_cscal( p, cnegone, phantom(1_${ik}$), 1_${ik}$ )
                 call stdlib${ii}$_clarfgp( p, phantom(1_${ik}$), phantom(2_${ik}$), 1_${ik}$, taup1(1_${ik}$) )
                 call stdlib${ii}$_clarfgp( m-p, phantom(p+1), phantom(p+2), 1_${ik}$, taup2(1_${ik}$) )
                 theta(i) = atan2( real( phantom(1_${ik}$),KIND=sp), real( phantom(p+1),KIND=sp) )
                           
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 phantom(1_${ik}$) = cone
                 phantom(p+1) = cone
                 call stdlib${ii}$_clarf( 'L', p, q, phantom(1_${ik}$), 1_${ik}$, conjg(taup1(1_${ik}$)), x11,ldx11, work(&
                           ilarf) )
                 call stdlib${ii}$_clarf( 'L', m-p, q, phantom(p+1), 1_${ik}$, conjg(taup2(1_${ik}$)),x21, ldx21, &
                           work(ilarf) )
              else
                 call stdlib${ii}$_cunbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,x21(i,i-1), 1_${ik}$, x11(i,i)&
                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
                 call stdlib${ii}$_cscal( p-i+1, cnegone, x11(i,i-1), 1_${ik}$ )
                 call stdlib${ii}$_clarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1_${ik}$, taup1(i) )
                 call stdlib${ii}$_clarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1_${ik}$,taup2(i) )
                 theta(i) = atan2( real( x11(i,i-1),KIND=sp), real( x21(i,i-1),KIND=sp) )
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 x11(i,i-1) = cone
                 x21(i,i-1) = cone
                 call stdlib${ii}$_clarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,conjg(taup1(i)), x11(i,i), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_clarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1_${ik}$,conjg(taup2(i)), x21(i,i), &
                           ldx21, work(ilarf) )
              end if
              call stdlib${ii}$_csrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
              call stdlib${ii}$_clacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_clarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              c = real( x21(i,i),KIND=sp)
              x21(i,i) = cone
              call stdlib${ii}$_clarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_clarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_clacgv( q-i+1, x21(i,i), ldx21 )
              if( i < m-q ) then
                 s = sqrt( stdlib${ii}$_scnrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_scnrm2( m-p-i, x21(i+1,&
                           i), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
              end if
           end do
           ! reduce the bottom-right portion of x11 to [ i 0 ]
           do i = m - q + 1, p
              call stdlib${ii}$_clacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_clarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_clarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_clarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_clacgv( q-i+1, x11(i,i), ldx11 )
           end do
           ! reduce the bottom-right portion of x21 to [ 0 i ]
           do i = p + 1, q
              call stdlib${ii}$_clacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
              call stdlib${ii}$_clarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
                        
              x21(m-q+i-p,i) = cone
              call stdlib${ii}$_clarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
                        , ldx21, work(ilarf) )
              call stdlib${ii}$_clacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
           end do
           return
     end subroutine stdlib${ii}$_cunbdb4

     module subroutine stdlib${ii}$_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
     !! M-P, or Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB3 handle cases in
     !! which M-Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               phantom, work, lwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(dp), intent(out) :: phi(*), theta(*)
           complex(dp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(dp) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < m-q .or. m-p < m-q ) then
              info = -2_${ik}$
           else if( q < m-q .or. q > m ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( q-1, p-1, m-p-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q
              lworkopt = ilarf + llarf - 1_${ik}$
              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1_${ik}$ )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB4', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., m-q of x11 and x21
           do i = 1, m-q
              if( i == 1_${ik}$ ) then
                 do j = 1, m
                    phantom(j) = czero
                 end do
                 call stdlib${ii}$_zunbdb5( p, m-p, q, phantom(1_${ik}$), 1_${ik}$, phantom(p+1), 1_${ik}$,x11, ldx11, x21, &
                           ldx21, work(iorbdb5),lorbdb5, childinfo )
                 call stdlib${ii}$_zscal( p, cnegone, phantom(1_${ik}$), 1_${ik}$ )
                 call stdlib${ii}$_zlarfgp( p, phantom(1_${ik}$), phantom(2_${ik}$), 1_${ik}$, taup1(1_${ik}$) )
                 call stdlib${ii}$_zlarfgp( m-p, phantom(p+1), phantom(p+2), 1_${ik}$, taup2(1_${ik}$) )
                 theta(i) = atan2( real( phantom(1_${ik}$),KIND=dp), real( phantom(p+1),KIND=dp) )
                           
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 phantom(1_${ik}$) = cone
                 phantom(p+1) = cone
                 call stdlib${ii}$_zlarf( 'L', p, q, phantom(1_${ik}$), 1_${ik}$, conjg(taup1(1_${ik}$)), x11,ldx11, work(&
                           ilarf) )
                 call stdlib${ii}$_zlarf( 'L', m-p, q, phantom(p+1), 1_${ik}$, conjg(taup2(1_${ik}$)),x21, ldx21, &
                           work(ilarf) )
              else
                 call stdlib${ii}$_zunbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,x21(i,i-1), 1_${ik}$, x11(i,i)&
                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
                 call stdlib${ii}$_zscal( p-i+1, cnegone, x11(i,i-1), 1_${ik}$ )
                 call stdlib${ii}$_zlarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1_${ik}$, taup1(i) )
                 call stdlib${ii}$_zlarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1_${ik}$,taup2(i) )
                 theta(i) = atan2( real( x11(i,i-1),KIND=dp), real( x21(i,i-1),KIND=dp) )
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 x11(i,i-1) = cone
                 x21(i,i-1) = cone
                 call stdlib${ii}$_zlarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,conjg(taup1(i)), x11(i,i), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_zlarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1_${ik}$,conjg(taup2(i)), x21(i,i), &
                           ldx21, work(ilarf) )
              end if
              call stdlib${ii}$_zdrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
              call stdlib${ii}$_zlacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_zlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              c = real( x21(i,i),KIND=dp)
              x21(i,i) = cone
              call stdlib${ii}$_zlarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_zlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_zlacgv( q-i+1, x21(i,i), ldx21 )
              if( i < m-q ) then
                 s = sqrt( stdlib${ii}$_dznrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_dznrm2( m-p-i, x21(i+1,&
                           i), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
              end if
           end do
           ! reduce the bottom-right portion of x11 to [ i 0 ]
           do i = m - q + 1, p
              call stdlib${ii}$_zlacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_zlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_zlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_zlarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_zlacgv( q-i+1, x11(i,i), ldx11 )
           end do
           ! reduce the bottom-right portion of x21 to [ 0 i ]
           do i = p + 1, q
              call stdlib${ii}$_zlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
              call stdlib${ii}$_zlarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
                        
              x21(m-q+i-p,i) = cone
              call stdlib${ii}$_zlarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
                        , ldx21, work(ilarf) )
              call stdlib${ii}$_zlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
           end do
           return
     end subroutine stdlib${ii}$_zunbdb4

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     module subroutine stdlib${ii}$_${ci}$unbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
     !! ZUNBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
     !! matrix X with orthonomal columns:
     !! [ B11 ]
     !! [ X11 ]   [ P1 |    ] [  0  ]
     !! [-----] = [---------] [-----] Q1**T .
     !! [ X21 ]   [    | P2 ] [ B21 ]
     !! [  0  ]
     !! X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
     !! M-P, or Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB3 handle cases in
     !! which M-Q is not the minimum dimension.
     !! The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
     !! and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
     !! Householder vectors.
     !! B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
     !! implicitly by angles THETA, PHI.
               phantom, work, lwork,info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lwork, m, p, q, ldx11, ldx21
           ! Array Arguments 
           real(${ck}$), intent(out) :: phi(*), theta(*)
           complex(${ck}$), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
           complex(${ck}$), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
        ! ====================================================================
           
           ! Local Scalars 
           real(${ck}$) :: c, s
           integer(${ik}$) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
                     lworkopt
           logical(lk) :: lquery
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           if( m < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( p < m-q .or. m-p < m-q ) then
              info = -2_${ik}$
           else if( q < m-q .or. q > m ) then
              info = -3_${ik}$
           else if( ldx11 < max( 1_${ik}$, p ) ) then
              info = -5_${ik}$
           else if( ldx21 < max( 1_${ik}$, m-p ) ) then
              info = -7_${ik}$
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              ilarf = 2_${ik}$
              llarf = max( q-1, p-1, m-p-1 )
              iorbdb5 = 2_${ik}$
              lorbdb5 = q
              lworkopt = ilarf + llarf - 1_${ik}$
              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1_${ik}$ )
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not.lquery ) then
                info = -14_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB4', -info )
              return
           else if( lquery ) then
              return
           end if
           ! reduce columns 1, ..., m-q of x11 and x21
           do i = 1, m-q
              if( i == 1_${ik}$ ) then
                 do j = 1, m
                    phantom(j) = czero
                 end do
                 call stdlib${ii}$_${ci}$unbdb5( p, m-p, q, phantom(1_${ik}$), 1_${ik}$, phantom(p+1), 1_${ik}$,x11, ldx11, x21, &
                           ldx21, work(iorbdb5),lorbdb5, childinfo )
                 call stdlib${ii}$_${ci}$scal( p, cnegone, phantom(1_${ik}$), 1_${ik}$ )
                 call stdlib${ii}$_${ci}$larfgp( p, phantom(1_${ik}$), phantom(2_${ik}$), 1_${ik}$, taup1(1_${ik}$) )
                 call stdlib${ii}$_${ci}$larfgp( m-p, phantom(p+1), phantom(p+2), 1_${ik}$, taup2(1_${ik}$) )
                 theta(i) = atan2( real( phantom(1_${ik}$),KIND=${ck}$), real( phantom(p+1),KIND=${ck}$) )
                           
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 phantom(1_${ik}$) = cone
                 phantom(p+1) = cone
                 call stdlib${ii}$_${ci}$larf( 'L', p, q, phantom(1_${ik}$), 1_${ik}$, conjg(taup1(1_${ik}$)), x11,ldx11, work(&
                           ilarf) )
                 call stdlib${ii}$_${ci}$larf( 'L', m-p, q, phantom(p+1), 1_${ik}$, conjg(taup2(1_${ik}$)),x21, ldx21, &
                           work(ilarf) )
              else
                 call stdlib${ii}$_${ci}$unbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,x21(i,i-1), 1_${ik}$, x11(i,i)&
                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
                 call stdlib${ii}$_${ci}$scal( p-i+1, cnegone, x11(i,i-1), 1_${ik}$ )
                 call stdlib${ii}$_${ci}$larfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1_${ik}$, taup1(i) )
                 call stdlib${ii}$_${ci}$larfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1_${ik}$,taup2(i) )
                 theta(i) = atan2( real( x11(i,i-1),KIND=${ck}$), real( x21(i,i-1),KIND=${ck}$) )
                 c = cos( theta(i) )
                 s = sin( theta(i) )
                 x11(i,i-1) = cone
                 x21(i,i-1) = cone
                 call stdlib${ii}$_${ci}$larf( 'L', p-i+1, q-i+1, x11(i,i-1), 1_${ik}$,conjg(taup1(i)), x11(i,i), &
                           ldx11, work(ilarf) )
                 call stdlib${ii}$_${ci}$larf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1_${ik}$,conjg(taup2(i)), x21(i,i), &
                           ldx21, work(ilarf) )
              end if
              call stdlib${ii}$_${ci}$drot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(i,i), ldx21 )
              call stdlib${ii}$_${ci}$larfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
              c = real( x21(i,i),KIND=${ck}$)
              x21(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$larf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(i,i), ldx21 )
              if( i < m-q ) then
                 s = sqrt( stdlib${ii}$_${c2ri(ci)}$znrm2( p-i, x11(i+1,i), 1_${ik}$ )**2_${ik}$+ stdlib${ii}$_${c2ri(ci)}$znrm2( m-p-i, x21(i+1,&
                           i), 1_${ik}$ )**2_${ik}$ )
                 phi(i) = atan2( s, c )
              end if
           end do
           ! reduce the bottom-right portion of x11 to [ i 0 ]
           do i = m - q + 1, p
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x11(i,i), ldx11 )
              call stdlib${ii}$_${ci}$larfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
              x11(i,i) = cone
              call stdlib${ii}$_${ci}$larf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$larf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
                        work(ilarf) )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x11(i,i), ldx11 )
           end do
           ! reduce the bottom-right portion of x21 to [ 0 i ]
           do i = p + 1, q
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
              call stdlib${ii}$_${ci}$larfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
                        
              x21(m-q+i-p,i) = cone
              call stdlib${ii}$_${ci}$larf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
                        , ldx21, work(ilarf) )
              call stdlib${ii}$_${ci}$lacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
           end do
           return
     end subroutine stdlib${ii}$_${ci}$unbdb4

#:endif
#:endfor



     pure module subroutine stdlib${ii}$_cunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
     !! CUNBDB5 orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then some other vector from the orthogonal complement
     !! is returned. This vector is chosen in an arbitrary but deterministic
     !! way.
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(sp), intent(out) :: work(*)
           complex(sp), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, j
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB5', -info )
              return
           end if
           ! project x onto the orthogonal complement of q
           call stdlib${ii}$_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
                     childinfo )
           ! if the projection is nonzero, then return
           if( stdlib${ii}$_scnrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_scnrm2(m2,x2,incx2) /= czero ) &
                     then
              return
           end if
           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
           ! when a nonzero projection is found
           do i = 1, m1
              do j = 1, m1
                 x1(j) = czero
              end do
              x1(i) = cone
              do j = 1, m2
                 x2(j) = czero
              end do
              call stdlib${ii}$_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_scnrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_scnrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
           ! stopping when a nonzero projection is found
           do i = 1, m2
              do j = 1, m1
                 x1(j) = czero
              end do
              do j = 1, m2
                 x2(j) = czero
              end do
              x2(i) = cone
              call stdlib${ii}$_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_scnrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_scnrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           return
     end subroutine stdlib${ii}$_cunbdb5

     pure module subroutine stdlib${ii}$_zunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
     !! ZUNBDB5 orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then some other vector from the orthogonal complement
     !! is returned. This vector is chosen in an arbitrary but deterministic
     !! way.
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(dp), intent(out) :: work(*)
           complex(dp), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, j
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB5', -info )
              return
           end if
           ! project x onto the orthogonal complement of q
           call stdlib${ii}$_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
                     childinfo )
           ! if the projection is nonzero, then return
           if( stdlib${ii}$_dznrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_dznrm2(m2,x2,incx2) /= czero ) &
                     then
              return
           end if
           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
           ! when a nonzero projection is found
           do i = 1, m1
              do j = 1, m1
                 x1(j) = czero
              end do
              x1(i) = cone
              do j = 1, m2
                 x2(j) = czero
              end do
              call stdlib${ii}$_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_dznrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_dznrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
           ! stopping when a nonzero projection is found
           do i = 1, m2
              do j = 1, m1
                 x1(j) = czero
              end do
              do j = 1, m2
                 x2(j) = czero
              end do
              x2(i) = cone
              call stdlib${ii}$_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_dznrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_dznrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           return
     end subroutine stdlib${ii}$_zunbdb5

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     pure module subroutine stdlib${ii}$_${ci}$unbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
     !! ZUNBDB5: orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then some other vector from the orthogonal complement
     !! is returned. This vector is chosen in an arbitrary but deterministic
     !! way.
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(${ck}$), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(${ck}$), intent(out) :: work(*)
           complex(${ck}$), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           
           ! Local Scalars 
           integer(${ik}$) :: childinfo, i, j
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB5', -info )
              return
           end if
           ! project x onto the orthogonal complement of q
           call stdlib${ii}$_${ci}$unbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
                     childinfo )
           ! if the projection is nonzero, then return
           if( stdlib${ii}$_${c2ri(ci)}$znrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_${c2ri(ci)}$znrm2(m2,x2,incx2) /= czero ) &
                     then
              return
           end if
           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
           ! when a nonzero projection is found
           do i = 1, m1
              do j = 1, m1
                 x1(j) = czero
              end do
              x1(i) = cone
              do j = 1, m2
                 x2(j) = czero
              end do
              call stdlib${ii}$_${ci}$unbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_${c2ri(ci)}$znrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_${c2ri(ci)}$znrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
           ! stopping when a nonzero projection is found
           do i = 1, m2
              do j = 1, m1
                 x1(j) = czero
              end do
              do j = 1, m2
                 x2(j) = czero
              end do
              x2(i) = cone
              call stdlib${ii}$_${ci}$unbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
                        lwork, childinfo )
              if( stdlib${ii}$_${c2ri(ci)}$znrm2(m1,x1,incx1) /= czero.or. stdlib${ii}$_${c2ri(ci)}$znrm2(m2,x2,incx2) /= czero ) &
                        then
                 return
              end if
           end do
           return
     end subroutine stdlib${ii}$_${ci}$unbdb5

#:endif
#:endfor



     pure module subroutine stdlib${ii}$_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, lwork, info )
     !! CUNBDB6 orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then the zero vector is returned.
               
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(sp), intent(out) :: work(*)
           complex(sp), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           ! Parameters 
           real(sp), parameter :: alphasq = 0.01_sp
           
           
           
           ! Local Scalars 
           integer(${ik}$) :: i
           real(sp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CUNBDB6', -info )
              return
           end if
           ! first, project x onto the orthogonal complement of q's column
           ! space
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_classq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_classq( m2, x2, incx2, scl2, ssq2 )
           normsq1 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_cgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_cgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_cgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_cgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_classq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_classq( m2, x2, incx2, scl2, ssq2 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if projection is sufficiently large in norm, then stop.
           ! if projection is czero, then stop.
           ! otherwise, project again.
           if( normsq2 >= alphasq*normsq1 ) then
              return
           end if
           if( normsq2 == czero ) then
              return
           end if
           normsq1 = normsq2
           do i = 1, n
              work(i) = czero
           end do
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_cgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_cgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_cgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_cgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_classq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_classq( m1, x1, incx1, scl1, ssq1 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if second projection is sufficiently large in norm, then do
           ! nothing more. alternatively, if it shrunk significantly, then
           ! truncate it to czero.
           if( normsq2 < alphasq*normsq1 ) then
              do i = 1, m1
                 x1(i) = czero
              end do
              do i = 1, m2
                 x2(i) = czero
              end do
           end if
           return
     end subroutine stdlib${ii}$_cunbdb6

     pure module subroutine stdlib${ii}$_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
     !! ZUNBDB6 orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then the zero vector is returned.
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(dp), intent(out) :: work(*)
           complex(dp), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           ! Parameters 
           real(dp), parameter :: alphasq = 0.01_dp
           
           
           
           ! Local Scalars 
           integer(${ik}$) :: i
           real(dp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB6', -info )
              return
           end if
           ! first, project x onto the orthogonal complement of q's column
           ! space
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_zlassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_zlassq( m2, x2, incx2, scl2, ssq2 )
           normsq1 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_zgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_zgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_zgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_zgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_zlassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_zlassq( m2, x2, incx2, scl2, ssq2 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if projection is sufficiently large in norm, then stop.
           ! if projection is czero, then stop.
           ! otherwise, project again.
           if( normsq2 >= alphasq*normsq1 ) then
              return
           end if
           if( normsq2 == czero ) then
              return
           end if
           normsq1 = normsq2
           do i = 1, n
              work(i) = czero
           end do
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_zgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_zgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_zgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_zgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_zlassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_zlassq( m1, x1, incx1, scl1, ssq1 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if second projection is sufficiently large in norm, then do
           ! nothing more. alternatively, if it shrunk significantly, then
           ! truncate it to czero.
           if( normsq2 < alphasq*normsq1 ) then
              do i = 1, m1
                 x1(i) = czero
              end do
              do i = 1, m2
                 x2(i) = czero
              end do
           end if
           return
     end subroutine stdlib${ii}$_zunbdb6

#:for ck,ct,ci in CMPLX_KINDS_TYPES
#:if not ck in ["sp","dp"]
     pure module subroutine stdlib${ii}$_${ci}$unbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
     !! ZUNBDB6: orthogonalizes the column vector
     !! X = [ X1 ]
     !! [ X2 ]
     !! with respect to the columns of
     !! Q = [ Q1 ] .
     !! [ Q2 ]
     !! The columns of Q must be orthonormal.
     !! If the projection is zero according to Kahan's "twice is enough"
     !! criterion, then the zero vector is returned.
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${ck}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           integer(${ik}$), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
           integer(${ik}$), intent(out) :: info
           ! Array Arguments 
           complex(${ck}$), intent(in) :: q1(ldq1,*), q2(ldq2,*)
           complex(${ck}$), intent(out) :: work(*)
           complex(${ck}$), intent(inout) :: x1(*), x2(*)
        ! =====================================================================
           ! Parameters 
           real(${ck}$), parameter :: alphasq = 0.01_${ck}$
           
           
           
           ! Local Scalars 
           integer(${ik}$) :: i
           real(${ck}$) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
           ! Intrinsic Function 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           if( m1 < 0_${ik}$ ) then
              info = -1_${ik}$
           else if( m2 < 0_${ik}$ ) then
              info = -2_${ik}$
           else if( n < 0_${ik}$ ) then
              info = -3_${ik}$
           else if( incx1 < 1_${ik}$ ) then
              info = -5_${ik}$
           else if( incx2 < 1_${ik}$ ) then
              info = -7_${ik}$
           else if( ldq1 < max( 1_${ik}$, m1 ) ) then
              info = -9_${ik}$
           else if( ldq2 < max( 1_${ik}$, m2 ) ) then
              info = -11_${ik}$
           else if( lwork < n ) then
              info = -13_${ik}$
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'ZUNBDB6', -info )
              return
           end if
           ! first, project x onto the orthogonal complement of q's column
           ! space
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_${ci}$lassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_${ci}$lassq( m2, x2, incx2, scl2, ssq2 )
           normsq1 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_${ci}$gemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_${ci}$gemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_${ci}$gemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_${ci}$gemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_${ci}$lassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_${ci}$lassq( m2, x2, incx2, scl2, ssq2 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if projection is sufficiently large in norm, then stop.
           ! if projection is czero, then stop.
           ! otherwise, project again.
           if( normsq2 >= alphasq*normsq1 ) then
              return
           end if
           if( normsq2 == czero ) then
              return
           end if
           normsq1 = normsq2
           do i = 1, n
              work(i) = czero
           end do
           if( m1 == 0_${ik}$ ) then
              do i = 1, n
                 work(i) = czero
              end do
           else
              call stdlib${ii}$_${ci}$gemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1_${ik}$ )
           end if
           call stdlib${ii}$_${ci}$gemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1_${ik}$ )
           call stdlib${ii}$_${ci}$gemv( 'N', m1, n, cnegone, q1, ldq1, work, 1_${ik}$, cone, x1,incx1 )
           call stdlib${ii}$_${ci}$gemv( 'N', m2, n, cnegone, q2, ldq2, work, 1_${ik}$, cone, x2,incx2 )
           scl1 = zero
           ssq1 = one
           call stdlib${ii}$_${ci}$lassq( m1, x1, incx1, scl1, ssq1 )
           scl2 = zero
           ssq2 = one
           call stdlib${ii}$_${ci}$lassq( m1, x1, incx1, scl1, ssq1 )
           normsq2 = scl1**2_${ik}$*ssq1 + scl2**2_${ik}$*ssq2
           ! if second projection is sufficiently large in norm, then do
           ! nothing more. alternatively, if it shrunk significantly, then
           ! truncate it to czero.
           if( normsq2 < alphasq*normsq1 ) then
              do i = 1, m1
                 x1(i) = czero
              end do
              do i = 1, m2
                 x2(i) = czero
              end do
           end if
           return
     end subroutine stdlib${ii}$_${ci}$unbdb6

#:endif
#:endfor


#:endfor
end submodule stdlib_lapack_cosine_sine