eic-opticks/qudarap/qsim.h at 7b14deab52b0e372534eb8060f915b5a19fcdcf1 · BNLNPPS/eic-opticks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
qsim.h : GPU side struct prepared CPU side by QSim.hh
========================================================

qsim.h replaces the OptiX 6 context in a CUDA-centric way.
Canonical use is from CSGOptiX/CSGOptiX7.cu:simulate

* qsim.h instance is uploaded once only at CSGOptiX instantiation
  as this encompasses the physics not the event-by-event info.

* qsim encompasses global info relevant to all photons, meaning that any changes
  made to the qsim instance from single photon threads must be into thread-owned "idx"
  slots into arrays to avoid interference

* temporary working state local to each photon is held in *sctx*
  and passed around using reference arguments

TODO:

1. get more of the below to work on CPU with mocked curand (and in future mocked tex2D and cudaTextureObject_t )
   NB must move decl and implementation to do this

**/

#if defined(__CUDACC__) || defined(__CUDABE__)
   #define QSIM_METHOD __device__
#else
   #define QSIM_METHOD
#endif

#include "OpticksGenstep.h"
#include "OpticksPhoton.h"

#include "sflow.h"
#include "sqat4.h"
#include "sc4u.h"
#include "sxyz.h"
#include "sphoton.h"

#include "storch.h"
#include "scarrier.h"
#include "sevent.h"
#include "sstate.h"
#include "smatsur.h"


#ifndef PRODUCTION
#include "srec.h"
#include "sseq.h"
#include "stag.h"
#ifdef DEBUG_LOGF
#define KLUDGE_FASTMATH_LOGF(u) (u < 0.998f ? __logf(u) : __logf(u) - 0.46735790f*1e-7f )
#endif
#endif

#include "sctx.h"

#include "qrng.h"
#include "qbase.h"
#include "qprop.h"
#include "qmultifilm.h"
#include "qbnd.h"
#include "qscint.h"
#include "qcerenkov.h"
#include "qpmt.h"
#include "tcomplex.h"


struct qcerenkov ;

// qsim : aggregate of pointers to the simulation components plus the photon
// generation and propagation methods that use them.
// Most methods are only declared when compiling for the device or when one of
// the MOCK_CURAND / MOCK_CUDA switches is defined; the remainder are always declared.
struct qsim
{
    // Component pointers : all set to nullptr by the host-side constructor
    qbase*              base ;
    sevent*             evt ;
    qrng<RNG>*          rng ;
    qbnd*               bnd ;
    qmultifilm*         multifilm;
    qcerenkov*          cerenkov ;
    qscint*             scint ;
    qpmt<float>*        pmt ;

    // The constructor is only declared for non-device compilation
#if defined(__CUDACC__) || defined(__CUDABE__)
#else
    qsim(); // instantiated on CPU (see QSim::init_sim) and copied to device so no ctor in device code
#endif

    // Declared unconditionally
    QSIM_METHOD void    generate_photon_dummy( sphoton& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
    QSIM_METHOD static float3 uniform_sphere(const float u0, const float u1);
    QSIM_METHOD static float RandGaussQ_shoot( RNG& rng, float mean, float stdDev );
    QSIM_METHOD static void SmearNormal_SigmaAlpha( RNG& rng, float3* smeared_normal, const float3* direction, const float3* normal, float sigma_alpha, const sctx& ctx );
    QSIM_METHOD static void SmearNormal_Polish(     RNG& rng, float3* smeared_normal, const float3* direction, const float3* normal, float polish     , const sctx& ctx );

    // Overload drawing its two uniforms from the generator
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
    QSIM_METHOD static float3 uniform_sphere(RNG& rng);
#endif

    // Device only : forwards to the multifilm component
#if defined(__CUDACC__) || defined(__CUDABE__)
    QSIM_METHOD float4  multifilm_lookup(unsigned pmtType, float nm, float aoi);
#endif

    // Random directions, bulk scattering and the step up to the next boundary
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND )  || defined(MOCK_CUDA)
    QSIM_METHOD static void lambertian_direction(float3* dir, const float3* normal, float orient, RNG& rng, sctx& ctx );
    QSIM_METHOD static void random_direction_marsaglia(float3* dir, RNG& rng, sctx& ctx );
    QSIM_METHOD void rayleigh_scatter(RNG& rng, sctx& ctx );
    QSIM_METHOD int     propagate_to_boundary( unsigned& flag, RNG& rng, sctx& ctx );
#endif

    // Boundary handling
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
    QSIM_METHOD int     propagate_at_boundary(        unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance=-1.f ) const ;
    QSIM_METHOD int     propagate_at_boundary_with_T( unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance ) const ;
#endif

    // Device only : multifilm surface handling
#if defined(__CUDACC__) || defined(__CUDABE__)
    QSIM_METHOD int     propagate_at_surface_MultiFilm(unsigned& flag, RNG& rng, sctx& ctx );
#endif

    // Surface handling : the CustomART variant additionally requires WITH_CUSTOM4
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
    QSIM_METHOD int     propagate_at_surface(           unsigned& flag, RNG& rng, sctx& ctx );
    QSIM_METHOD int     propagate_at_surface_Detect(    unsigned& flag, RNG& rng, sctx& ctx ) const ;
#if defined( WITH_CUSTOM4 )
    QSIM_METHOD int     propagate_at_surface_CustomART( unsigned& flag, RNG& rng, sctx& ctx ) const ;
#endif
#endif

    // Reflection, top level propagation and photon generation
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
    QSIM_METHOD void    reflect_diffuse(                       RNG& rng, sctx& ctx );
    QSIM_METHOD void    reflect_specular(                      RNG& rng, sctx& ctx );

    QSIM_METHOD void    fake_propagate( sphoton& p, const quad2* mock_prd, RNG& rng, unsigned long long idx );
    QSIM_METHOD int     propagate(const int bounce, RNG& rng, sctx& ctx );

    QSIM_METHOD void    hemisphere_polarized( unsigned polz, bool inwards, RNG& rng, sctx& ctx );
    QSIM_METHOD void    generate_photon_simtrace(         quad4&   p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
    QSIM_METHOD void    generate_photon_simtrace_frame(   quad4&   p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
    QSIM_METHOD void    generate_photon(                  sphoton& p, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const ;
#endif
};

// CTOR
#if defined(__CUDACC__) || defined(__CUDABE__)
#else
// Host-side constructor (see QSim::init_sim) : starts with every component
// pointer null. The instance is copied to the device, so no ctor exists in device code.
inline qsim::qsim()
    : base(nullptr)
    , evt(nullptr)
    , rng(nullptr)
    , bnd(nullptr)
    , multifilm(nullptr)
    , cerenkov(nullptr)
    , scint(nullptr)
    , pmt(nullptr)
{
}
#endif

/**
qsim::generate_photon_dummy
-----------------------------

Fills the photon with a fixed placeholder pattern: each of the four quads
gets the integers (1,2,3,4), then the TORCH flag is set.
Outside PRODUCTION builds the ids and the integer content of the first
genstep quad are printed.

p_
    photon to fill, viewed as a quad4
rng
    unused here
gs
    genstep, only read for the debug printout
photon_id, genstep_id
    only used for the debug printout

**/
inline QSIM_METHOD void qsim::generate_photon_dummy(sphoton& p_, RNG& rng, const quad6& gs, unsigned long long photon_id, unsigned genstep_id ) const
{
    quad4& p = (quad4&)p_ ;
#ifndef PRODUCTION
    // conversion specifiers match the unsigned argument types : %llu for photon_id, %u for genstep_id
    printf("//qsim::generate_photon_dummy  photon_id %3llu genstep_id %3u  gs.q0.i ( gencode:%3d %3d %3d %3d ) \n",
       photon_id,
       genstep_id,
       gs.q0.i.x,
       gs.q0.i.y,
       gs.q0.i.z,
       gs.q0.i.w
      );
#endif
    p.q0.i.x = 1 ; p.q0.i.y = 2 ; p.q0.i.z = 3 ; p.q0.i.w = 4 ;
    p.q1.i.x = 1 ; p.q1.i.y = 2 ; p.q1.i.z = 3 ; p.q1.i.w = 4 ;
    p.q2.i.x = 1 ; p.q2.i.y = 2 ; p.q2.i.z = 3 ; p.q2.i.w = 4 ;
    p.q3.i.x = 1 ; p.q3.i.y = 2 ; p.q3.i.z = 3 ; p.q3.i.w = 4 ;

    p.set_flag(TORCH);
}

/**
qsim::uniform_sphere (from two supplied numbers)
--------------------------------------------------

u0
    scaled by 2*pi to give the azimuthal angle
u1
    mapped linearly onto cos(theta) = 2*u1 - 1

Returns the unit vector (cos(phi)*sin(theta), sin(phi)*sin(theta), cos(theta)).

**/
inline QSIM_METHOD float3 qsim::uniform_sphere(const float u0, const float u1)
{
    const float azimuth = u0*2.f*M_PIf;
    const float cos_polar = 2.f*u1 - 1.f ;   // u1 of 0.f -> -1.f, u1 of 1.f -> 1.f
    const float sin_polar = sqrtf(1.f-cos_polar*cos_polar);

    const float x = cosf(azimuth)*sin_polar ;
    const float y = sinf(azimuth)*sin_polar ;
    return make_float3(x, y, cos_polar);
}


#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
/**
qsim::uniform_sphere
---------------------

Draws two uniforms from rng, the first for the azimuth and the second
for cos(theta), and converts them to a unit vector with the
two-float overload.

**/
inline QSIM_METHOD float3 qsim::uniform_sphere(RNG& rng)
{
    const float u_azimuth = curand_uniform(&rng) ;   // consumed first
    const float u_polar   = curand_uniform(&rng) ;   // consumed second
    return uniform_sphere(u_azimuth, u_polar) ;
}

/**
qsim::RandGaussQ_shoot
------------------------

See::

    sysrap/tests/erfcinvf_Test.sh
    sysrap/tests/S4MTRandGaussQTest.sh

    g4-cls RandGaussQ
    g4-cls G4MTRandGaussQ

**/
inline QSIM_METHOD float qsim::RandGaussQ_shoot( RNG& rng, float mean, float stdDev )
{
    // one uniform is consumed; it is doubled before being passed to erfcinvf
    const float twice_u = 2.f*curand_uniform(&rng) ;

    // deviate before scaling by stdDev and shifting by mean
    const float deviate = -M_SQRT2f*erfcinvf(twice_u) ;

    //printf("//qsim.RandGaussQ_shoot mean %10.5f stdDev %10.5f u2 %10.5f v %10.5f \n", mean, stdDev, twice_u, deviate*stdDev + mean  ) ;
    return deviate*stdDev + mean ;
}


/**
qsim::SmearNormal_SigmaAlpha
------------------------------

CAUTION : THIS CURRENTLY NOT USED BY ANYTHING OTHER THAN TESTS

The reason is that a Geant4 simulation with some sigma_alpha/polish
surfaces will not actually use G4OpBoundaryProcess::GetFacetNormal unless
a bunch of conditions are satisfied such as having non-zero values of some
of prob_sl/prob_ss/prob_bs that induces G4OpBoundaryProcess::ChooseReflection
to return something other than LambertianReflection.

**THIS MAKES ME SUSPECT ACTUAL USE OF NORMAL SMEARING IS VERY RARE**

+----------------------+-------------------------------+------------------------------+----------------------+
| G4OpBoundaryProcess  | G4MaterialPropertiesIndex.hh  | G4MaterialPropertiesTable.cc | ChooseReflection     |
+======================+===============================+==============================+======================+
| prob_sl              | kSPECULARLOBECONSTANT         | SPECULARLOBECONSTANT         | LobeReflection       |
+----------------------+-------------------------------+------------------------------+----------------------+
| prob_ss              | kSPECULARSPIKECONSTANT        | SPECULARSPIKECONSTANT        | SpikeReflection      |
+----------------------+-------------------------------+------------------------------+----------------------+
| prob_bs              | kBACKSCATTERCONSTANT          | BACKSCATTERCONSTANT          | BackScattering       |
+----------------------+-------------------------------+------------------------------+----------------------+
| all zero =>          |                               |                              | LambertianReflection |
+----------------------+-------------------------------+------------------------------+----------------------+

TODO: full simulation run with breakpoint "BP=C4OpBoundaryProcess::GetFacetNormal"

* C4 (not G4) as Custom4 is in use for the boundary process

**/

// Writes a smeared version of *normal into *smeared_normal using two nested
// rejection loops. ctx is only read for the debug dump switch.
inline QSIM_METHOD void qsim::SmearNormal_SigmaAlpha(
    RNG& rng,
    float3* smeared_normal,
    const float3* direction,
    const float3* normal,
    float sigma_alpha,
    const sctx& ctx
   )
{
#if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
    bool dump = ctx.pidx == -1 ;
#endif

    // zero sigma_alpha : no smearing and no randoms consumed
    if(sigma_alpha == 0.f)
    {
        *smeared_normal = *normal ;
        return ;
    }
    // scale applied to u1 in the inner rejection test, capped at 1.f
    float f_max = fminf(1.f,4.f*sigma_alpha);

#if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
    if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG sigma_alpha %10.5f f_max %10.5f  \n", sigma_alpha, f_max );
#endif

    float alpha, sin_alpha, phi, u0, u1, u2 ;
    bool reject_alpha ;
    bool reject_dir ;

    // outer loop : repeat until the smeared normal passes the direction test
    do {
        // inner loop : sample alpha, consuming two uniforms (u0 then u1) per turn
        do {
            //alpha = RandGaussQ_shoot(rng, 0.f, sigma_alpha );  // mean:0.f stdDev:sigma_alpha
            u0 = curand_uniform(&rng) ;
            alpha = -M_SQRT2f*erfcinvf(2.f*u0)*sigma_alpha ;

            sin_alpha = sinf(alpha);
            u1 = curand_uniform(&rng) ;
            // reject when alpha reaches pi/2 or when the scaled uniform exceeds sin(alpha)
            reject_alpha = alpha >= M_PIf/2.f || (u1*f_max > sin_alpha) ;

#if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
            if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u0 %10.5f alpha %10.5f sin_alpha %10.5f u1 %10.5f u1*f_max %10.5f  (u1*f_max > sin_alpha) %d reject_alpha %d  \n",
               u0, alpha, sin_alpha, u1, u1*f_max, (u1*f_max > sin_alpha), reject_alpha );
            // there are lots of alpha rejected : eg all -ve sin_alpha
#endif

        } while( reject_alpha ) ;

        // third uniform gives the azimuth of the facet normal
        u2 = curand_uniform(&rng) ;
        phi = u2*M_PIf*2.f ;

        // vector at polar angle alpha and azimuth phi about the z axis
        smeared_normal->x = sin_alpha * cosf(phi) ;
        smeared_normal->y = sin_alpha * sinf(phi) ;
        smeared_normal->z = cosf(alpha) ;

        // NOTE(review): presumably rotates so the local z axis lies along *normal - confirm in smath::rotateUz
        smath::rotateUz(*smeared_normal, *normal);
        reject_dir = dot(*smeared_normal, *direction ) >= 0.f ;
        // reject smears that move the normal into same hemi as direction

#if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
        if(dump) printf("//qsim::SmearNormal_SigmaAlpha.MOCK_CUDA_DEBUG u2 %10.5f phi %10.5f smeared_normal ( %10.5f, %10.5f, %10.5f)  reject_dir %d  \n",
               u2, phi, smeared_normal->x, smeared_normal->y, smeared_normal->z, reject_dir );
#endif


    } while( reject_dir ) ;
}

/**
qsim::SmearNormal_Polish
------------------------------

CAUTION : THIS CURRENTLY NOT USED BY ANYTHING OTHER THAN TESTS : SEE DETAILS ABOVE

**/

inline QSIM_METHOD void qsim::SmearNormal_Polish(
    RNG& rng,
    float3* smeared_normal,
    const float3* direction,
    const float3* normal,
    float polish,
    const sctx& ctx
    )
{
#if !defined(PRODUCTION) && defined(MOCK_CUDA_DEBUG)
    bool dump = ctx.pidx == -1 ;
#endif

    // polish of exactly 1.f : the normal is returned untouched, no randoms consumed
    if(polish == 1.f)
    {
        *smeared_normal = *normal ;
        return ;
    }

    float3 offset ;

    for(;;)
    {
        // pick a point within the cube (-1,1)^3, three uniforms per attempt,
        // retrying while its length exceeds 1.f
        for(;;)
        {
            const float ux = curand_uniform(&rng);
            const float uy = curand_uniform(&rng) ;
            const float uz = curand_uniform(&rng) ;

            offset.x = 2.f*ux - 1.f ;
            offset.y = 2.f*uy - 1.f ;
            offset.z = 2.f*uz - 1.f ;

            if( !( length(offset) > 1.f ) ) break ;
        }

        // displace the normal by the offset scaled with (1 - polish)
        *smeared_normal = *normal + (1.f-polish)*offset;

        // keep only candidates with negative dot product against the direction
        if( !( dot(*smeared_normal, *direction) >= 0.f ) ) break ;
    }

    *smeared_normal = normalize(*smeared_normal);
}


#endif

#if defined(__CUDACC__) || defined(__CUDABE__)
/*
qsim::multifilm_lookup
-----------------------

*/

inline QSIM_METHOD float4 qsim::multifilm_lookup(unsigned pmtType, float nm, float aoi)
{
    // plain forwarding of all three arguments to the multifilm component
    return multifilm->lookup(pmtType, nm, aoi);
}

#endif

#if defined(__CUDACC__) || defined(__CUDABE__) || defined(MOCK_CURAND) || defined(MOCK_CUDA)

/**
qsim::lambertian_direction following G4LambertianRand
--------------------------------------------------------

g4-cls G4RandomTools::

     59 inline G4ThreeVector G4LambertianRand(const G4ThreeVector& normal)
     60 {
     61   G4ThreeVector vect;
     62   G4double ndotv;
     63   G4int count=0;
     64   const G4int max_trials = 1024;
     65
     66   do
     67   {
     68     ++count;
     69     vect = G4RandomDirection();
     70     ndotv = normal * vect;
     71
     72     if (ndotv < 0.0)
     73     {
     74       vect = -vect;
     75       ndotv = -ndotv;
     76     }
     77
     78   } while (!(G4UniformRand() < ndotv) && (count < max_trials));
     79
     80   return vect;
     81 }


NB: potentially bad for performance for dir pointer to be into global mem
as opposed to local stack float3 : as this keeps changing the dir before
arriving at the final one

**/
inline  QSIM_METHOD void qsim::lambertian_direction(float3* dir, const float3* normal, float orient, RNG& rng, sctx& ctx )
{
#if !defined(PRODUCTION) && defined(DEBUG_PIDX)
    unsigned long long PIDX = 0xffffffffff ;
    if(ctx.pidx == PIDX )
    {
        printf("//qsim.lambertian_direction.head pidx %7lld : normal = np.array([%10.5f,%10.5f,%10.5f]) ; orient = %10.5f  \n",
            ctx.pidx, normal->x, normal->y, normal->z, orient  );
    }
#endif

    const int max_trials = 1024 ;   // same cap as G4LambertianRand
    int trials = 0 ;
    float cos_to_normal ;
    float u_accept ;
    bool retry ;

    do
    {
        ++trials ;

        // candidate : random point on the unit sphere
        random_direction_marsaglia(dir, rng, ctx);
        cos_to_normal = dot( *dir, *normal )*orient ;

        // a candidate in the hemisphere opposite to the oriented normal
        // is mirrored into the same hemisphere, together with its cosine
        if( cos_to_normal < 0.f )
        {
            *dir = -1.f*(*dir) ;
            cos_to_normal = -1.f*cos_to_normal ;
        }

        // one acceptance uniform is consumed on every trial
        u_accept = curand_uniform(&rng) ;

#if !defined(PRODUCTION) && defined(DEBUG_PIDX)

        if(ctx.pidx == PIDX)
        {
            printf("//qsim.lambertian_direction.loop pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d ; ndotv = %10.5f ; u = %10.5f \n",
                ctx.pidx, dir->x, dir->y, dir->z, trials, cos_to_normal, u_accept   );

        }
#endif

        // accept when the uniform is below the cosine, give up after max_trials
        retry = !(u_accept < cos_to_normal) && (trials < max_trials) ;
    }
    while( retry ) ;
    // distribution looks pretty similar without the while loop


#if !defined(PRODUCTION) && defined(DEBUG_PIDX)
    if(ctx.pidx == PIDX)
    {
        printf("//qsim.lambertian_direction.tail pidx %7lld : dir = np.array([%10.5f,%10.5f,%10.5f]) ; count = %d ; ndotv = %10.5f \n",
            ctx.pidx, dir->x, dir->y, dir->z, trials, cos_to_normal  );

    }
#endif


}

/**
qsim::random_direction_marsaglia following G4RandomDirection
-------------------------------------------------------------

* https://mathworld.wolfram.com/SpherePointPicking.html

::

                          v
                          |
              +---------.-|- -----------+
              |      .    |     .       |
              |   .       |          .  |
              |           |             |
              | .         |            .|
              |           |             |
          ----+-----------0-------------+---- u
              |.          |             |
              |           |            .|
              | .         |             |
              |           |          .  |
              |    .      |      .      |
              +--------.--|--.----------+
                          |

Marsaglia (1972) derived an elegant method that consists of picking u and v from independent
uniform distributions on (-1,1) and rejecting points for which uu+vv >=1.
So we are picking points from within the (u,v) disc.

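For those remaining random points on the 2D (u,v) disc the below (u,v) to (x,y,z)
mapping is used to give a 3D position on the unit sphere,

    x=2*u*sqrt(1-(uu+vv))
    y=2*v*sqrt(1-(uu+vv))
    z=1-2(uu+vv)

NB: the implementation below follows G4RandomDirection in using z=2(uu+vv)-1,
the mirror image in z, which leaves the normalization check unchanged.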

Checking normalization, it reduces to 1::

   xx + yy + zz =
         4uu (1-(uu+vv)) +
         4vv (1-(uu+vv)) +
        1 -4(uu+vv) + 4(uu+vv)(uu+vv)
                = 1

So that means the random 3D (x,y,z) points are on the unit sphere.

::

     g4-cls G4RandomDirection

     58 // G.Marsaglia (1972) method
     59 inline G4ThreeVector G4RandomDirection()
     60 {
     61   G4double u, v, b;
     62   do {
     63     u = 2.*G4UniformRand() - 1.;
     64     v = 2.*G4UniformRand() - 1.;
     65     b = u*u + v*v;
     66   } while (b > 1.);
     67   G4double a = 2.*std::sqrt(1. - b);
     68   return G4ThreeVector(a*u, a*v, 2.*b - 1.);
     69 }

**/


inline QSIM_METHOD void qsim::random_direction_marsaglia(float3* dir,  RNG& rng, sctx& ctx  )
{
    // NB: no use of ctx.tagr so this has not been random aligned
    float du ;
    float dv ;
    float rr ;

    // two uniforms per attempt, mapped onto (-1,1); retry while outside the unit disc
    for(;;)
    {
        const float first  = curand_uniform(&rng);
        const float second = curand_uniform(&rng);

        du = 2.f*first - 1.f ;
        dv = 2.f*second - 1.f ;
        rr = du*du + dv*dv ;

        if( !( rr > 1.f ) ) break ;
    }

    // lift the accepted disc point onto the unit sphere
    const float scale = 2.f*sqrtf( 1.f - rr );

    dir->x = scale*du ;
    dir->y = scale*dv ;
    dir->z = 2.f*rr - 1.f ;
}


/**
qsim::rayleigh_scatter
------------------------------

Following G4OpRayleigh::PostStepDoIt

* https://bugzilla-geant4.kek.jp/show_bug.cgi?id=207 Xin Qian patch


Transverse wave nature means::

   dot(p_direction, p_polarization)  = 0
   dot(direction,   polarization)  = 0

*constant* and normalized direction retains transversality thru the candidate scatter::

    pol = p_pol + constant*dir

    dot(pol, dir) = dot(p_pol, dir) + constant* dot(dir, dir)
                  = dot(p_pol, dir) + constant* 1.
                  = dot(p_pol, dir) - dot(p_pol, dir)
                  = 0.

**/

// Replaces ctx.p.mom and ctx.p.pol with a newly sampled direction and polarization.
// Each turn of the rejection loop consumes exactly five uniforms (u0..u4).
inline QSIM_METHOD void qsim::rayleigh_scatter(RNG& rng, sctx& ctx )
{
    sphoton& p = ctx.p ;
    float3 direction ;
    float3 polarization ;

    bool looping(true) ;
    do
    {
        // all five uniforms are drawn up front, whether or not every one ends up used
        float u0 = curand_uniform(&rng) ;
        float u1 = curand_uniform(&rng) ;
        float u2 = curand_uniform(&rng) ;
        float u3 = curand_uniform(&rng) ;
        float u4 = curand_uniform(&rng) ;

#if !defined(PRODUCTION) && defined(DEBUG_TAG)
        stagr& tagr = ctx.tagr ;  // UNTESTED
        tagr.add(stag_sc, u0);
        tagr.add(stag_sc, u1);
        tagr.add(stag_sc, u2);
        tagr.add(stag_sc, u3);
        tagr.add(stag_sc, u4);
#endif
        // |cosTheta| comes from u0, its sign from u1
        float cosTheta = u0 ;
        float sinTheta = sqrtf(1.0f-u0*u0);
        if(u1 < 0.5f ) cosTheta = -cosTheta ;
        // could use uniform_sphere here : but not doing so to follow G4OpRayleigh more closely

        float sinPhi ;
        float cosPhi ;

        // azimuth from u2 : the mock build uses separate sinf/cosf in place of sincosf
#if defined(MOCK_CURAND ) || defined(MOCK_CUDA)
        //__sincosf(2.f*M_PIf*u2,&sinPhi,&cosPhi);   // apple extension
        float phi = 2.f*M_PIf*u2 ;
        sinPhi = sinf(phi);
        cosPhi = cosf(phi);
#else
        sincosf(2.f*M_PIf*u2,&sinPhi,&cosPhi);
#endif

        // candidate direction expressed about the z axis
        direction.x = sinTheta * cosPhi;
        direction.y = sinTheta * sinPhi;
        direction.z = cosTheta ;

        // NOTE(review): presumably rotates so the local z axis lies along p.mom - confirm in smath::rotateUz
        smath::rotateUz(direction, p.mom );

        // subtract from the old polarization its component along the new direction
        float constant = -dot(direction, p.pol );

        polarization.x = p.pol.x + constant*direction.x ;
        polarization.y = p.pol.y + constant*direction.y ;
        polarization.z = p.pol.z + constant*direction.z ;

        if(dot(polarization, polarization) == 0.f )
        {
            // nothing left after the subtraction : pick a polarization in the
            // local x-y plane at an azimuth from u3, then rotate it with the new direction

#if defined( MOCK_CURAND ) || defined(MOCK_CUDA)
            //__sincosf(2.f*M_PIf*u3,&sinPhi,&cosPhi);
            phi = 2.f*M_PIf*u3 ;
            sinPhi = sinf(phi);
            cosPhi = cosf(phi);
#else
            sincosf(2.f*M_PIf*u3,&sinPhi,&cosPhi);
#endif

            polarization.x = cosPhi ;
            polarization.y = sinPhi ;
            polarization.z = 0.f ;

            smath::rotateUz(polarization, direction);
        }
        else
        {
            // There are two directions which are perpendicular
            // to the new momentum direction
            if(u3 < 0.5f) polarization = -polarization ;
        }
        polarization = normalize(polarization);

        // simulate according to the distribution cos^2(theta)
        // where theta is the angle between old and new polarizations
        float doCosTheta = dot(polarization, p.pol ) ;
        float doCosTheta2 = doCosTheta*doCosTheta ;
        // the candidate is rejected, and the loop repeats, while cos^2 is below u4
        looping = doCosTheta2 < u4 ;

    } while ( looping ) ;

    // commit the accepted candidate to the photon
    p.mom = direction ;
    p.pol = polarization ;
}


/**
qsim::propagate_to_boundary
------------------------------

+---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
| flag                |   command        |  changed                                                |  note                                                 |
+=====================+==================+=========================================================+=======================================================+
|   BULK_REEMIT       |   CONTINUE       |  time, position, direction, polarization, wavelength    | advance to reemit position with everything changed    |
+---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
|   BULK_SCATTER      |   CONTINUE       |  time, position, direction, polarization                | advance to scatter position, new dir+pol              |
+---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
|   BULK_ABSORB       |   BREAK          |  time, position                                         | advance to absorption position, dir+pol unchanged     |
+---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+
|   not set "SAIL"    |   BOUNDARY       |  time, position                                         | advanced to border position, dir+pol unchanged        |
+---------------------+------------------+---------------------------------------------------------+-------------------------------------------------------+


TODO:
   whilst in measurement iteration try changing the four "return"
   in the below into a single return: by setting command variable
   and returning that

**/


// Advances ctx.p to whichever comes first of absorption, scattering or the boundary.
// Sets flag to BULK_REEMIT, BULK_ABSORB or BULK_SCATTER for the bulk outcomes and
// leaves it untouched when sailing to the boundary.
// Returns CONTINUE (reemit or scatter), BREAK (absorb) or BOUNDARY (sail).
inline QSIM_METHOD int qsim::propagate_to_boundary(unsigned& flag, RNG& rng, sctx& ctx)
{
    sphoton& p = ctx.p ;
    const sstate& s = ctx.s ;

    // named views of the material1 components and other inputs
    const float& absorption_length = s.material1.y ;
    const float& scattering_length = s.material1.z ;
    const float& reemission_prob = s.material1.w ;
    const float& group_velocity = ctx.current_group_velocity;
    const float& distance_to_boundary = ctx.prd->q0.f.w ;


    // with DEBUG_TAG two extra uniforms are consumed ahead of the physics ones
#if !defined(PRODUCTION) && defined(DEBUG_TAG)
    float u_to_sci = curand_uniform(&rng) ;  // purely for alignment with G4
    float u_to_bnd = curand_uniform(&rng) ;  // purely for alignment with G4
#endif
    float u_scattering = curand_uniform(&rng) ;
    float u_absorption = curand_uniform(&rng) ;

#if !defined(PRODUCTION) && defined(DEBUG_TAG)
    stagr& tagr = ctx.tagr ;
    tagr.add( stag_to_sci, u_to_sci);
    tagr.add( stag_to_bnd, u_to_bnd);
    tagr.add( stag_to_sca, u_scattering);
    tagr.add( stag_to_abs, u_absorption);
#endif


    // distances are -length*log(u) : DEBUG_LOGF swaps in the KLUDGE_FASTMATH_LOGF macro
#if !defined(PRODUCTION) && defined(DEBUG_LOGF)
    // see notes/issues/U4LogTest_maybe_replacing_G4Log_G4UniformRand_in_Absorption_and_Scattering_with_float_version_will_avoid_deviations.rst
    float scattering_distance = -scattering_length*KLUDGE_FASTMATH_LOGF(u_scattering);
    float absorption_distance = -absorption_length*KLUDGE_FASTMATH_LOGF(u_absorption);
#else
    float scattering_distance = -scattering_length*logf(u_scattering);
    float absorption_distance = -absorption_length*logf(u_absorption);
#endif

#if !defined(PRODUCTION) && defined(DEBUG_PIDX)

    if(ctx.pidx == base->pidx)
    {
    printf("//qsim.propagate_to_boundary.head pidx %7lld : u_absorption %10.8f logf(u_absorption) %10.8f absorption_length %10.4f absorption_distance %10.6f \n",
        ctx.pidx, u_absorption, logf(u_absorption), absorption_length, absorption_distance );

    printf("//qsim.propagate_to_boundary.head pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) \n",
        ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time );

    printf("//qsim.propagate_to_boundary.head pidx %7lld : distance_to_boundary %10.4f absorption_distance %10.4f scattering_distance %10.4f \n",
             ctx.pidx, distance_to_boundary, absorption_distance, scattering_distance );

    printf("//qsim.propagate_to_boundary.head pidx %7lld : u_scattering %10.4f u_absorption %10.4f \n",
             ctx.pidx, u_scattering, u_absorption  );

    }
#endif


    // absorption wins ties against scattering
    if (absorption_distance <= scattering_distance)
    {
        if (absorption_distance <= distance_to_boundary)
        {
            // advance time and position to the absorption point
            p.time += absorption_distance/group_velocity ;
            p.pos  += absorption_distance*(p.mom) ;


#if !defined(PRODUCTION) && defined(DEBUG_PIDX)
            float absorb_time_delta = absorption_distance/group_velocity ;
            if( ctx.pidx == base->pidx )
            {
            printf("//qsim.propagate_to_boundary.body.BULK_ABSORB pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ; absorb_time_delta = %10.8f   \n",
                    ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, absorb_time_delta  );

            }
#endif

            // the sentinel 2.f can never be below reemission_prob of 0.f, so the absorb branch is taken
            float u_reemit = reemission_prob == 0.f ? 2.f : curand_uniform(&rng);  // avoid consumption at absorption when not scintillator


#if !defined(PRODUCTION) && defined(DEBUG_TAG)
            if( u_reemit != 2.f ) tagr.add( stag_to_ree, u_reemit) ;
#endif


            if (u_reemit < reemission_prob)
            {
                // reemission consumes five more uniforms : wavelength, two for momentum, two for polarization
                float u_re_wavelength = curand_uniform(&rng);
                float u_re_mom_ph = curand_uniform(&rng);
                float u_re_mom_ct = curand_uniform(&rng);
                float u_re_pol_ph = curand_uniform(&rng);
                float u_re_pol_ct = curand_uniform(&rng);

                p.wavelength = scint->wavelength(u_re_wavelength);
                p.mom = uniform_sphere(u_re_mom_ph, u_re_mom_ct);
                // polarization : normalized cross product of a second random direction with the new momentum
                p.pol = normalize(cross(uniform_sphere(u_re_pol_ph, u_re_pol_ct), p.mom));

#if !defined(PRODUCTION) && defined(DEBUG_TAG)
                tagr.add( stag_re_wl, u_re_wavelength);
                tagr.add( stag_re_mom_ph, u_re_mom_ph);
                tagr.add( stag_re_mom_ct, u_re_mom_ct);
                tagr.add( stag_re_pol_ph, u_re_pol_ph);
                tagr.add( stag_re_pol_ct, u_re_pol_ct);
#endif

                flag = BULK_REEMIT ;
                return CONTINUE;
            }
            else
            {
                flag = BULK_ABSORB ;
                return BREAK;
            }
        }
        //  otherwise sail to boundary
    }
    else
    {
        if (scattering_distance <= distance_to_boundary)
        {
            // advance time and position to the scatter point
            p.time += scattering_distance/group_velocity ;
            p.pos  += scattering_distance*(p.mom) ;

            rayleigh_scatter(rng, ctx);  // changes dir and pol, consumes 5u at each turn of rejection sampling loop

            flag = BULK_SCATTER;

            return CONTINUE;
        }
          //  otherwise sail to boundary
    }     // if scattering_distance < absorption_distance


    // neither bulk process happened before the boundary : advance all the way, flag untouched
    p.pos  += distance_to_boundary*(p.mom) ;
    p.time += distance_to_boundary/group_velocity   ;

#if !defined(PRODUCTION) && defined(DEBUG_PIDX)
    float sail_time_delta = distance_to_boundary/group_velocity ;
    if( ctx.pidx == base->pidx ) printf("//qsim.propagate_to_boundary.tail.SAIL pidx %7lld : post = np.array([%10.5f,%10.5f,%10.5f,%10.5f]) ;  sail_time_delta = %10.5f   \n",
          ctx.pidx, p.pos.x, p.pos.y, p.pos.z, p.time, sail_time_delta  );
#endif

    return BOUNDARY ;
}
#endif
#if defined(__CUDACC__) || defined(__CUDABE__) || defined( MOCK_CURAND ) || defined(MOCK_CUDA)
/**
qsim::propagate_at_boundary
------------------------------------------

This was brought over from oxrap/cu/propagate.h:propagate_at_boundary_geant4_style
See env-/g4op-/G4OpBoundaryProcess.cc annotations to follow this
and compare the Opticks and Geant4 implementations.

Input:

* p.direction
* p.polarization
* s.material1.x    : refractive index
* s.material2.x    : refractive index
* prd.normal

Consumes one random number to decide between BOUNDARY_REFLECT and BOUNDARY_TRANSMIT

+---------------------+------------------+-------------------------------+----------+
| output flag         |   command        |  changed                      |  note    |
+=====================+==================+===============================+==========+
|   BOUNDARY_REFLECT  |    -             |  direction, polarization      |          |
+---------------------+------------------+-------------------------------+----------+
|   BOUNDARY_TRANSMIT |    -             |  direction, polarization      |          |
+---------------------+------------------+-------------------------------+----------+

Notes:

* when geometry and refractive indices dictate TIR (total internal reflection) there is no dependence on u_reflect and the photon always reflects


::

                    s1
                  +----+
                   \   .   /      ^
              c1   i\  .  / r    /|\
                     \ . /        |
        material1     \./         | n
        ---------------+----------+----------
        material2      .\
                       . \
                  c2   .  \ t
                       .   \
                       +----+
                         s2

Snell's law::

     s1    n2
    --- = ---         s1 n1 = s2 n2         eta = n1/n2     s1 eta = s2
     s2    n1

     s1.s1 = 1 - c1.c1   # trig identity

     s2.s2 = 1 - c2.c2

    s1 eta = s2          # snell

    s1s1 eta eta = s2s2

    ( 1.f - c1c1 ) eta eta = 1.f - c2c2

     c2c2 = 1.f - eta eta ( 1.f - c1c1 )    # snell and trig identity

Because the electromagnetic wave is transverse, the field incident onto the
interface can be decomposed into two polarization components, one P-polarized,
i.e., with the electric field vector inside the plane of incidence, and the
other one S-polarized, i.e., orthogonal to that plane.


inconsistent normal definitions, c1 is expected to be +ve and normal needs to be oriented against initial direction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This is apparent from reflected direction vector::

      *direction + 2.0f*c1*surface_normal

The standard normal vector at an intersection position on the surface of a shape
is defined to be rigidly oriented outwards away from the shape.
This definition is used by *fill_state* in order to determine properties
of this material m1 and the next material m2 on the other side of the boundary.

The math below assumes that the photon direction is always against the normal
such that the sign of c1 is +ve. Having -ve c1 leads to a nonsensical -ve TransCoeff
which results in always reflecting.

So what about photons going in the other direction?
The surface normal is used in several places below, so presumably must
arrange to have an oriented normal that is flipped appropriately, OR perhaps the math can be changed?

In summary this is a case of inconsistent definitions of the normal,
that will need to be oriented ~half the time.

TODO: try avoiding "float3 oriented_normal" instead just use "bool orient"
      and multiply prd.normal by 1.f or -1.f depending on orient at every use


random aligned matching with examples/Geant/BoundaryStandalone
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* S/P/"X"-polarized + TIR + normal_incidence all now matching
* noted a 1-in-a-million deviant from TransCoeff cut edge float/double for S and P

* initially had two more deviants at very close to normal incidence that were aligned by changing
  the criteria to match Geant4 "sint1 == 0." better::

    //const bool normal_incidence = fabs(c1) > 0.999999f ;
    const bool normal_incidence = fabs(c1) == 1.f ;

* see notes/issues/QSimTest_propagate_at_boundary_vs_BoundaryStandalone_G4OpBoundaryProcessTest.rst


**Normal Incidence Special Case**

Judging normal_incidence based on the absolute dot product being exactly unity "c1 == 1.f" is problematic
as when very near to normal incidence there are vectors for which the absolute dot product
is not quite 1.f but the cross product does give an exactly zero vector which gives
A_trans (nan, nan, nan) from the normalize doing : (zero,zero,zero)/zero.

Solution is to judge normal incidence based on trans_length as that is what the
calculation actually needs to be non-zero in order to be able to normalize trans to give A_trans.

However using "bool normal_incidence = trans_length == 0.f" is also problematic
as it would mean using very small trans vectors to define A_trans, and this
would cause a difference between double precision Geant4 and float precision Opticks.
So try using a cutoff "trans_length < 1e-6f" below which to special case normal
incidence.

**/

inline QSIM_METHOD int qsim::propagate_at_boundary(unsigned& flag, RNG& rng, sctx& ctx, float theTransmittance ) const
{
#if !defined(PRODUCTION) && defined(DEBUG_PIDX)