stdlib_lapack_cosine_sine.fypp Source File


Source Code

#:include "common.fypp" 
submodule(stdlib_lapack_eig_svd_lsq) stdlib_lapack_cosine_sine
  implicit none


  contains
#:for ik,it,ii in LINALG_INT_KINDS_TYPES

     pure module subroutine stdlib${ii}$_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! SBBCSD computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See SORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(sp), intent(inout) :: phi(*), theta(*)
           real(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(sp), parameter :: hundred = 100.0_sp
           real(sp), parameter :: meighth = -0.125_sp
           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'SBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_slamch( 'EPSILON' )
           unfl = stdlib${ii}$_slamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_slartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_slartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_slartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_slartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_slartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_slartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_slasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_slasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_sscal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_sscal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_sscal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_sscal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_sscal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_sswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_sswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_sswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_sswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_sswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_sswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_sswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_sswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_sbbcsd

     pure module subroutine stdlib${ii}$_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! DBBCSD computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See DORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(dp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(dp), intent(inout) :: phi(*), theta(*)
           real(dp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(dp), parameter :: hundred = 100.0_dp
           real(dp), parameter :: meighth = -0.125_dp
           real(dp), parameter :: piover2 = 1.57079632679489661923132169163975144210_dp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(dp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_dlamch( 'EPSILON' )
           unfl = stdlib${ii}$_dlamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_dlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_dlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_dlartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_dlartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_dlartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_dlartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_dlartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_dlartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_dlartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_dlartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_dlartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_dlartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_dlartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_dlartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_dlartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_dlartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_dlartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_dlasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_dlasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_dscal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_dscal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_dscal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_dscal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_dscal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_dswap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_dswap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_dswap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_dswap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_dswap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_dswap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_dswap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_dswap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_dbbcsd

#:for rk,rt,ri in REAL_KINDS_TYPES
#:if not rk in ["sp","dp"]
     pure module subroutine stdlib${ii}$_${ri}$bbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     !! DBBCSD: computes the CS decomposition of an orthogonal matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See DORCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The orthogonal matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
               lwork, info )
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${rk}$, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
           ! Array Arguments 
           real(${rk}$), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), work(*)
           real(${rk}$), intent(inout) :: phi(*), theta(*)
           real(${rk}$), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(${rk}$), parameter :: hundred = 100.0_${rk}$
           real(${rk}$), parameter :: meighth = -0.125_${rk}$
           real(${rk}$), parameter :: piover2 = 1.57079632679489661923132169163975144210_${rk}$
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
           real(${rk}$) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lworkmin = 1_${ik}$
              work(1_${ik}$) = lworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lworkopt = iv2tsn + q - 1_${ik}$
              lworkmin = lworkopt
              work(1_${ik}$) = lworkopt
              if( lwork < lworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_${ri}$lamch( 'EPSILON' )
           unfl = stdlib${ii}$_${ri}$lamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_${ri}$las2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_${ri}$las2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_${ri}$lartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
                           imin-1) )
              end if
              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
              b11d(imin) = temp
              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
              b21d(imin) = temp
              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
                            r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_${ri}$lartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_${ri}$lartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
                            r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${ri}$lartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,work(iu2cs+imin-1), work(&
                           iu2sn+imin-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
                           imin-1) )
              end if
              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_${ri}$lartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${ri}$lartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
                              
                 end if
                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
                              1_${ik}$), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_${ri}$lartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 else
                    call stdlib${ii}$_${ri}$lartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
                              1_${ik}$-1) )
                 end if
                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_${ri}$lartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
                              
                 else if( mu <= nu ) then
                    call stdlib${ii}$_${ri}$lartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_${ri}$lartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
                              
                 else if( nu < mu ) then
                    call stdlib${ii}$_${ri}$lartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 else
                    call stdlib${ii}$_${ri}$lartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
                              
                 end if
                 work(iu2cs+i-1) = -work(iu2cs+i-1)
                 work(iu2sn+i-1) = -work(iu2sn+i-1)
                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
                           
              else if( .not. restart12 .and. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( restart12 .and. .not. restart22 ) then
                 call stdlib${ii}$_${ri}$lartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
                           imax-1-1), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_${ri}$lartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              else
                 call stdlib${ii}$_${ri}$lartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
                           iv2tsn+imax-1-1) )
              end if
              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
                        
              b12e(imax-1) = temp
              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
                        
              b22e(imax-1) = temp
              ! update singular vectors
              if( wantu1 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(1_${ik}$,imin), ldu1 )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
                              iu1sn+imin-1),u1(imin,1_${ik}$), ldu1 )
                 end if
              end if
              if( wantu2 ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(1_${ik}$,imin), ldu2 )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
                              iu2sn+imin-1),u2(imin,1_${ik}$), ldu2 )
                 end if
              end if
              if( wantv1t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(imin,1_${ik}$), ldv1t )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
                              iv1tsn+imin-1),v1t(1_${ik}$,imin), ldv1t )
                 end if
              end if
              if( wantv2t ) then
                 if( colmajor ) then
                    call stdlib${ii}$_${ri}$lasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(imin,1_${ik}$), ldv2t )
                 else
                    call stdlib${ii}$_${ri}$lasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
                              iv2tsn+imin-1),v2t(1_${ik}$,imin), ldv2t )
                 end if
              end if
              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
              if( b11e(imax-1)+b21e(imax-1) > 0_${ik}$ ) then
                 b11d(imax) = -b11d(imax)
                 b21d(imax) = -b21d(imax)
                 if( wantv1t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( q, negone, v1t(imax,1_${ik}$), ldv1t )
                    else
                       call stdlib${ii}$_${ri}$scal( q, negone, v1t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! compute theta(imax)
              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
              theta(imax) = atan2( abs(y1), abs(x1) )
              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
              ! and b22(imax,imax-1)
              if( b11d(imax)+b12e(imax-1) < 0_${ik}$ ) then
                 b12d(imax) = -b12d(imax)
                 if( wantu1 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( p, negone, u1(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ri}$scal( p, negone, u1(imax,1_${ik}$), ldu1 )
                    end if
                 end if
              end if
              if( b21d(imax)+b22e(imax-1) > 0_${ik}$ ) then
                 b22d(imax) = -b22d(imax)
                 if( wantu2 ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( m-p, negone, u2(1_${ik}$,imax), 1_${ik}$ )
                    else
                       call stdlib${ii}$_${ri}$scal( m-p, negone, u2(imax,1_${ik}$), ldu2 )
                    end if
                 end if
              end if
              ! fix signs on b12(imax,imax) and b22(imax,imax)
              if( b12d(imax)+b22d(imax) < 0_${ik}$ ) then
                 if( wantv2t ) then
                    if( colmajor ) then
                       call stdlib${ii}$_${ri}$scal( m-q, negone, v2t(imax,1_${ik}$), ldv2t )
                    else
                       call stdlib${ii}$_${ri}$scal( m-q, negone, v2t(1_${ik}$,imax), 1_${ik}$ )
                    end if
                 end if
              end if
              ! test for negligible sines or cosines
              do i = imin, imax
                 if( theta(i) < thresh ) then
                    theta(i) = zero
                 else if( theta(i) > piover2-thresh ) then
                    theta(i) = piover2
                 end if
              end do
              do i = imin, imax-1
                 if( phi(i) < thresh ) then
                    phi(i) = zero
                 else if( phi(i) > piover2-thresh ) then
                    phi(i) = piover2
                 end if
              end do
              ! deflate
              if (imax > 1_${ik}$) then
                 do while( phi(imax-1) == zero )
                    imax = imax - 1_${ik}$
                    if (imax <= 1) exit
                 end do
              end if
              if( imin > imax - 1_${ik}$ )imin = imax - 1_${ik}$
              if (imin > 1_${ik}$) then
                 do while (phi(imin-1) /= zero)
                     imin = imin - 1_${ik}$
                     if (imin <= 1) exit
                 end do
              end if
              ! repeat main iteration loop
           end do
           ! postprocessing: order theta from least to greatest
           do i = 1, q
              mini = i
              thetamin = theta(i)
              do j = i+1, q
                 if( theta(j) < thetamin ) then
                    mini = j
                    thetamin = theta(j)
                 end if
              end do
              if( mini /= i ) then
                 theta(mini) = theta(i)
                 theta(i) = thetamin
                 if( colmajor ) then
                    if( wantu1 )call stdlib${ii}$_${ri}$swap( p, u1(1_${ik}$,i), 1_${ik}$, u1(1_${ik}$,mini), 1_${ik}$ )
                    if( wantu2 )call stdlib${ii}$_${ri}$swap( m-p, u2(1_${ik}$,i), 1_${ik}$, u2(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv1t )call stdlib${ii}$_${ri}$swap( q, v1t(i,1_${ik}$), ldv1t, v1t(mini,1_${ik}$), ldv1t )
                              
                    if( wantv2t )call stdlib${ii}$_${ri}$swap( m-q, v2t(i,1_${ik}$), ldv2t, v2t(mini,1_${ik}$),ldv2t )
                              
                 else
                    if( wantu1 )call stdlib${ii}$_${ri}$swap( p, u1(i,1_${ik}$), ldu1, u1(mini,1_${ik}$), ldu1 )
                    if( wantu2 )call stdlib${ii}$_${ri}$swap( m-p, u2(i,1_${ik}$), ldu2, u2(mini,1_${ik}$), ldu2 )
                    if( wantv1t )call stdlib${ii}$_${ri}$swap( q, v1t(1_${ik}$,i), 1_${ik}$, v1t(1_${ik}$,mini), 1_${ik}$ )
                    if( wantv2t )call stdlib${ii}$_${ri}$swap( m-q, v2t(1_${ik}$,i), 1_${ik}$, v2t(1_${ik}$,mini), 1_${ik}$ )
                 end if
              end if
           end do
           return
     end subroutine stdlib${ii}$_${ri}$bbcsd

#:endif
#:endfor

     pure module subroutine stdlib${ii}$_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
               lrwork, info )
     !! CBBCSD computes the CS decomposition of a unitary matrix in
     !! bidiagonal-block form,
     !! [ B11 | B12 0  0 ]
     !! [  0  |  0 -I  0 ]
     !! X = [----------------]
     !! [ B21 | B22 0  0 ]
     !! [  0  |  0  0  I ]
     !! [  C | -S  0  0 ]
     !! [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
     !! = [---------] [---------------] [---------]   .
     !! [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
     !! [  0 |  0  0  I ]
     !! X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
     !! than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
     !! transposed and/or permuted. This can be done in constant time using
     !! the TRANS and SIGNS options. See CUNCSD for details.)
     !! The bidiagonal matrices B11, B12, B21, and B22 are represented
     !! implicitly by angles THETA(1:Q) and PHI(1:Q-1).
     !! The unitary matrices U1, U2, V1T, and V2T are input/output.
     !! The input matrices are pre- or post-multiplied by the appropriate
     !! singular vector matrices.
     
        ! -- lapack computational routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, one, ten, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
           ! Array Arguments 
           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
                      b22e(*), rwork(*)
           real(sp), intent(inout) :: phi(*), theta(*)
           complex(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
                     
        ! ===================================================================
           ! Parameters 
           integer(${ik}$), parameter :: maxitr = 6_${ik}$
           real(sp), parameter :: hundred = 100.0_sp
           real(sp), parameter :: meighth = -0.125_sp
           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
           
           
           
           
           ! Local Scalars 
           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
                     wantu2, wantv1t, wantv2t
           integer(${ik}$) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test input arguments
           info = 0_${ik}$
           lquery = lrwork == -1_${ik}$
           wantu1 = stdlib_lsame( jobu1, 'Y' )
           wantu2 = stdlib_lsame( jobu2, 'Y' )
           wantv1t = stdlib_lsame( jobv1t, 'Y' )
           wantv2t = stdlib_lsame( jobv2t, 'Y' )
           colmajor = .not. stdlib_lsame( trans, 'T' )
           if( m < 0_${ik}$ ) then
              info = -6_${ik}$
           else if( p < 0_${ik}$ .or. p > m ) then
              info = -7_${ik}$
           else if( q < 0_${ik}$ .or. q > m ) then
              info = -8_${ik}$
           else if( q > p .or. q > m-p .or. q > m-q ) then
              info = -8_${ik}$
           else if( wantu1 .and. ldu1 < p ) then
              info = -12_${ik}$
           else if( wantu2 .and. ldu2 < m-p ) then
              info = -14_${ik}$
           else if( wantv1t .and. ldv1t < q ) then
              info = -16_${ik}$
           else if( wantv2t .and. ldv2t < m-q ) then
              info = -18_${ik}$
           end if
           ! quick return if q = 0
           if( info == 0_${ik}$ .and. q == 0_${ik}$ ) then
              lrworkmin = 1_${ik}$
              rwork(1_${ik}$) = lrworkmin
              return
           end if
           ! compute workspace
           if( info == 0_${ik}$ ) then
              iu1cs = 1_${ik}$
              iu1sn = iu1cs + q
              iu2cs = iu1sn + q
              iu2sn = iu2cs + q
              iv1tcs = iu2sn + q
              iv1tsn = iv1tcs + q
              iv2tcs = iv1tsn + q
              iv2tsn = iv2tcs + q
              lrworkopt = iv2tsn + q - 1_${ik}$
              lrworkmin = lrworkopt
              rwork(1_${ik}$) = lrworkopt
              if( lrwork < lrworkmin .and. .not. lquery ) then
                 info = -28_${ik}$
              end if
           end if
           if( info /= 0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'CBBCSD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_slamch( 'EPSILON' )
           unfl = stdlib${ii}$_slamch( 'SAFE MINIMUM' )
           tolmul = max( ten, min( hundred, eps**meighth ) )
           tol = tolmul*eps
           thresh = max( tol, maxitr*q*q*unfl )
           ! test for negligible sines or cosines
           do i = 1, q
              if( theta(i) < thresh ) then
                 theta(i) = zero
              else if( theta(i) > piover2-thresh ) then
                 theta(i) = piover2
              end if
           end do
           do i = 1, q-1
              if( phi(i) < thresh ) then
                 phi(i) = zero
              else if( phi(i) > piover2-thresh ) then
                 phi(i) = piover2
              end if
           end do
           ! initial deflation
           imax = q
           do while( imax > 1 )
              if( phi(imax-1) /= zero ) then
                 exit
              end if
              imax = imax - 1_${ik}$
           end do
           imin = imax - 1_${ik}$
           if  ( imin > 1_${ik}$ ) then
              do while( phi(imin-1) /= zero )
                 imin = imin - 1_${ik}$
                 if  ( imin <= 1 ) exit
              end do
           end if
           ! initialize iteration counter
           maxit = maxitr*q*q
           iter = 0_${ik}$
           ! begin main iteration loop
           do while( imax > 1 )
              ! compute the matrix entries
              b11d(imin) = cos( theta(imin) )
              b21d(imin) = -sin( theta(imin) )
              do i = imin, imax - 1
                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
              end do
              b12d(imax) = sin( theta(imax) )
              b22d(imax) = cos( theta(imax) )
              ! abort if not converging; otherwise, increment iter
              if( iter > maxit ) then
                 info = 0_${ik}$
                 do i = 1, q
                    if( phi(i) /= zero )info = info + 1_${ik}$
                 end do
                 return
              end if
              iter = iter + imax - imin
              ! compute shifts
              thetamax = theta(imin)
              thetamin = theta(imin)
              do i = imin+1, imax
                 if( theta(i) > thetamax )thetamax = theta(i)
                 if( theta(i) < thetamin )thetamin = theta(i)
              end do
              if( thetamax > piover2 - thresh ) then
                 ! zero on diagonals of b11 and b22; induce deflation with a
                 ! zero shift
                 mu = zero
                 nu = one
              else if( thetamin < thresh ) then
                 ! zero on diagonals of b12 and b22; induce deflation with a
                 ! zero shift
                 mu = one
                 nu = zero
              else
                 ! compute shifts for b11 and b21 and use the lesser
                 call stdlib${ii}$_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
                           
                 call stdlib${ii}$_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
                           
                 if( sigma11 <= sigma21 ) then
                    mu = sigma11
                    nu = sqrt( one - mu**2_${ik}$ )
                    if( mu < thresh ) then
                       mu = zero
                       nu = one
                    end if
                 else
                    nu = sigma21
                    mu = sqrt( one - nu**2_${ik}$ )
                    if( nu < thresh ) then
                       mu = one
                       nu = zero
                    end if
                 end if
              end if
              ! rotate to produce bulges in b11 and b21
              if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
                           iv1tsn+imin-1) )
              end if
              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
                        
              b11d(imin) = temp
              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
                        
              b21d(imin) = temp
              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
              ! compute theta(imin)
              theta( imin ) = atan2( sqrt( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ ),sqrt( b11d(imin)**2_${ik}$+&
                        b11bulge**2_${ik}$ ) )
              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
              if( b11d(imin)**2_${ik}$+b11bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
                           1_${ik}$), r )
              else if( mu <= nu ) then
                 call stdlib${ii}$_slartgs( b11e( imin ), b11d( imin + 1_${ik}$ ), mu,rwork(iu1cs+imin-1), &
                           rwork(iu1sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
                           iu1sn+imin-1) )
              end if
              if( b21d(imin)**2_${ik}$+b21bulge**2_${ik}$ > thresh**2_${ik}$ ) then
                 call stdlib${ii}$_slartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
                           1_${ik}$), r )
              else if( nu < mu ) then
                 call stdlib${ii}$_slartgs( b21e( imin ), b21d( imin + 1_${ik}$ ), nu,rwork(iu2cs+imin-1), &
                           rwork(iu2sn+imin-1) )
              else
                 call stdlib${ii}$_slartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
                           imin-1) )
              end if
              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
                        
              b11e(imin) = temp
              if( imax > imin+1 ) then
                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
              end if
              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
              b12d(imin) = temp
              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
                        
              b21e(imin) = temp
              if( imax > imin+1 ) then
                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
              end if
              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
              b22d(imin) = temp
              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
              ! inner loop: chase bulges from b11(imin,imin+2),
              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
              ! bottom-right
              do i = imin+1, imax-1
                 ! compute phi(i-1)
                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
                 phi(i-1) = atan2( sqrt(x1**2_${ik}$+x2**2_${ik}$), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 = b11e(i-1)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 = b21e(i-1)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12d(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22d(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
                 else if( .not. restart11 .and. restart21 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( restart11 .and. .not. restart21 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
                               r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
                               )
                 end if
                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
                 if( .not. restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
                              
                 else if( .not. restart12 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( restart12 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
                              1_${ik}$-1), r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 else
                    call stdlib${ii}$_slartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
                              i-1-1) )
                 end if
                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
                 b11d(i) = temp
                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
                 b21d(i) = temp
                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
                 b12e(i-1) = temp
                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
                 b22e(i-1) = temp
                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
                 ! compute theta(i)
                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
                 theta(i) = atan2( sqrt(y1**2_${ik}$+y2**2_${ik}$), sqrt(x1**2_${ik}$+x2**2_${ik}$) )
                 ! determine if there are bulges to chase or if a new direct
                 ! summand has been reached
                 restart11 =   b11d(i)**2_${ik}$ + b11bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart12 = b12e(i-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart21 =   b21d(i)**2_${ik}$ + b21bulge**2_${ik}$ <= thresh**2_${ik}$
                 restart22 = b22e(i-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
                 ! chasing by applying the original shift again.
                 if( .not. restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
                 else if( .not. restart11 .and. restart12 ) then
                    call stdlib${ii}$_slartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
                              
                 else if( restart11 .and. .not. restart12 ) then
                    call stdlib${ii}$_slartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
                              r )
                 else if( mu <= nu ) then
                    call stdlib${ii}$_slartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
                              
                 end if
                 if( .not. restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
                 else if( .not. restart21 .and. restart22 ) then
                    call stdlib${ii}$_slartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
                              
                 else if( restart21 .and. .not. restart22 ) then
                    call stdlib${ii}$_slartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
                              r )
                 else if( nu < mu ) then
                    call stdlib${ii}$_slartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
                               )
                 else
                    call stdlib${ii}$_slartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
                              
                 end if
                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
                 b11e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
                 end if
                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
                 b21e(i) = temp
                 if( i < imax - 1_${ik}$ ) then
                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
                 end if
                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
                 b12d(i) = temp
                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
                 b22d(i) = temp
                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
              end do
              ! compute phi(imax-1)
              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
              phi(imax-1) = atan2( abs(x1), sqrt(y1**2_${ik}$+y2**2_${ik}$) )
              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
              restart12 = b12d(imax-1)**2_${ik}$ + b12bulge**2_${ik}$ <= thresh**2_${ik}$
              restart22 = b22d(imax-1)**2_${ik}$ + b22bulge**2_${ik}$ <= thresh**2_${ik}$
              if( .not. restart12