-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenvutil_payload.cc
2063 lines (1687 loc) · 73.6 KB
/
envutil_payload.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/************************************************************************/
/* */
/* utility to convert and extract images from 360 degree environments */
/* */
/* Copyright 2024 by Kay F. Jahnke */
/* */
/* The git repository for this software is at */
/* */
/* https://github.com/kfjahnke/envutil */
/* */
/* Please direct questions, bug reports, and contributions to */
/* */
/* kfjahnke+envutil@gmail.com */
/* */
/* Permission is hereby granted, free of charge, to any person */
/* obtaining a copy of this software and associated documentation */
/* files (the "Software"), to deal in the Software without */
/* restriction, including without limitation the rights to use, */
/* copy, modify, merge, publish, distribute, sublicense, and/or */
/* sell copies of the Software, and to permit persons to whom the */
/* Software is furnished to do so, subject to the following */
/* conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the */
/* Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
/* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
/* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
/* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
/* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
/* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
/* OTHER DEALINGS IN THE SOFTWARE. */
/* */
/************************************************************************/
#include "zimt/common.h"
#ifdef MULTI_SIMD_ISA
// This header defines all the macros having to do with targets:
#include <hwy/detect_targets.h>
// glean the target as 'TG_ISA' from outside - this file is intended
// to produce ISA-specific separate TUs containing only binary for
// one given ISA, but assuming that other files of similar structure,
// but for different ISAs will also be made and all linked together
// with more code which actually makes use of the single-ISA TUs.
// 'Slotting in' the target ISA from the build system is enough to
// produce a SIMD-ISA-specific TU - all the needed specifics are
// derived from this single information. detect_targets.h sets
// HWY_TARGET to HWY_STATIC_TARGET, so we #undef it and use the
// target specification from outside instead.
#undef HWY_TARGET
#define HWY_TARGET TG_ISA
// now we #include highway.h - as we would do after foreach_target.h
// in a multi-ISA build. With foreach_target.h, the code is re-included
// several times, each time with a different ISA. Here we have set one
// specific ISA and there won't be any re-includes.
#include <hwy/highway.h>
#else // MULTI_SIMD_ISA
#define HWY_TARGET TG_ISA
#endif // MULTI_SIMD_ISA
HWY_BEFORE_NAMESPACE() ;
// To conveniently rotate with a rotational quaternion, we employ
// Imath's 'Quat' data type, packaged in a zimt::unary_functor.
// This is not factored out because it requires inclusion of
// some Imath headers, which I want to keep out of the other
// code, e.g. in geometry.h, where it would fit in nicely.
#include <Imath/ImathVec.h>
#include <Imath/ImathEuler.h>
#include <Imath/ImathQuat.h>
#include <Imath/ImathLine.h>
// we #include xel.h in the ISA-specific namespace. It is guarded
// by a conventional sentinel, so further re-inclusions have no
// effect and the ISA environment is fixed for the entire TU.
// same for array.h.
#include "zimt/xel.h"
#include "zimt/array.h"
HWY_AFTER_NAMESPACE() ;
// this asset_t object is a handle to the current 'enviromnet'
// providing image data
static zimt::asset_t current_env ;
// envutil_dispatch.h has the definition of class dispatch_base and
// of the function get_dispatch.
#include "zimt/zimt.h"
#include "envutil_dispatch.h"
// environment.h has most of the 'workhorse' code for this program.
// it provides functional constructs to yield pixels for coordinates.
// These functors are used with zimt::process to populate the output.
#include "environment.h"
#include "stepper.h"
#include "lens_correction.h"
// now we invoke HWY_BEFORE_NAMESPACE() for this file, to make code
// down to HWY_AFTER_NAMESPACE() compile with settings for a specific
// target (via e.g. #pragmas to the compiler)
HWY_BEFORE_NAMESPACE() ;
BEGIN_ZIMT_SIMD_NAMESPACE(project)
// rotate_3d uses a SIMDized Imath Quaternion to affect a 3D rotation
// of a 3D SIMDized coordinate. Imath::Quat<float> can't broadcast
// to handle SIMDized input, but if we use an Imath::Quat of the
// SIMDized type, we get the desired effect for simdized input -
// hence the broadcasting to Imath::Quat<U> in 'eval', which
// has no effect for scalars, but represents a broadcast of the
// rotation to all lanes of the simdized quaternion's components.
// We'll use this functor to compare the output of steppers with
// built-in rotation to unrotated steppers with a subsequent
// rotation of the resulting 3D ray.
// rotate_3d: functor applying a rotation - given as Euler angles -
// to 3D ray coordinates, handling both scalar and SIMDized input.
template < typename T , std::size_t L >
struct rotate_3d
: public zimt::unary_functor
< zimt::xel_t < T , 3 > , zimt::xel_t < T , 3 > , L >
{
typedef zimt::simdized_type < T , L > f_v ;
typedef zimt::xel_t < T , 3 > crd3_t ;
typedef zimt::simdized_type < crd3_t , L > crd3_v ;
// the rotation is held as a scalar Imath quaternion; 'eval'
// broadcasts it to the argument's fundamental type as needed.
Imath::Quat < T > q ;
// c'tor: build the rotational quaternion from the given Euler
// angles (ZXY order). If 'inverse' is set, the quaternion is
// inverted, encoding the opposite rotation.
rotate_3d ( T roll = 0 , T pitch = 0 , T yaw = 0 , bool inverse = false )
{
// set up the rotational quaternion. if 'inverse' is set, produce
// the conjugate.
Imath::Eulerf angles ( roll , pitch , yaw , Imath::Eulerf::ZXY ) ;
q = angles.toQuat() ;
if ( inverse )
q.invert() ;
}
// eval applies the quaternion. Note how we use a template of
// typename U for the formulation. This way, we can handle both
// scalar and simdized arguments: broadcasting q to Imath::Quat<U>
// has no effect for scalar U, and for simdized U it replicates
// the rotation to all lanes of the quaternion's components.
template < typename U >
void eval ( const zimt::xel_t < U , 3 > & in ,
zimt::xel_t < U , 3 > & out ) const
{
// NOTE(review): these casts assume zimt::xel_t < U , 3 > and
// Imath::Vec3 < U > are layout-compatible - confirm this holds
// for every U in use. The rotation itself is done with Imath's
// operator* after broadcasting q to Imath::Quat < U > .
auto const & in_e
= reinterpret_cast < const Imath::Vec3 < U > & > ( in ) ;
auto & out_e
= reinterpret_cast < Imath::Vec3 < U > & > ( out ) ;
out_e = in_e * Imath::Quat < U > ( q ) ;
}
// for convenience: function-call syntax returning the rotated ray
template < typename U >
zimt::xel_t < U , 3 > operator() ( const zimt::xel_t < U , 3 > & in )
{
zimt::xel_t < U , 3 > out ;
eval ( in , out ) ;
return out ;
}
// express the rotation as a 3x3 matrix: rotate the three cardinal
// unit vectors and store the rotated vectors as the rows of 'm'.
template < typename U >
void as_matrix ( r3_t < U > & m )
{
xel_t < double , 3 > ex { 1.0 , 0.0 , 0.0 } ;
xel_t < double , 3 > ey { 0.0 , 1.0 , 0.0 } ;
xel_t < double , 3 > ez { 0.0 , 0.0 , 1.0 } ;
eval ( ex , ex ) ;
eval ( ey , ey ) ;
eval ( ez , ez ) ;
m[0] = ex ;
m[1] = ey ;
m[2] = ez ;
}
} ;
// directly produce an r3_t from Euler angles. This might also
// be done directly with a very large formula - this is easier:
// build an r3_t rotation matrix straight from Euler angles. Rather
// than spelling out the full Euler-to-matrix formula, we set up a
// scalar rotate_3d functor and extract its matrix representation.
template < typename T >
r3_t < T > make_r3_t ( T r , T p , T y ,
                       bool inverse = false )
{
  rotate_3d < T , 1 > rotor ( r , p , y , inverse ) ;
  r3_t < T > result ;
  rotor.as_matrix ( result ) ;
  return result ;
}
// 'work' is the function where the state we've built up in the
// code below it culminates in the call to zimt::process, which
// obtains pick-up coordinates from the 'get' object, produces
// pixels for these coordinates with the 'act' object and stores
// these pixels with a zimt::storer, which is set up in this
// function. All the specificity of the code has now moved to
// the types get_t and act_t - the roll_out is complete and we
// can code a single 'work' template which is good for all
// variants.
// 'work' is where all the state built up elsewhere culminates in the
// call to zimt::process: pick-up coordinates come from 'get', pixels
// are produced by 'act', and a zimt::storer - set up here - deposits
// them in the target array. All specificity now lives in the types
// get_t and act_t, so this single template serves all variants.
template < typename get_t , typename act_t >
void work ( get_t & get , act_t & act )
{
  typedef typename act_t::out_type px_t ;
  static const size_t nchannels = px_t::size() ;

  // size the target array. If the output carries a cropping
  // specification (from a PTO file's p-line), the array only
  // covers the cropped area; otherwise the full output extent.
  std::size_t width = args.width ;
  std::size_t height = args.height ;
  if ( args.store_cropped )
  {
    width = args.p_crop_x1 - args.p_crop_x0 ;
    height = args.p_crop_y1 - args.p_crop_y0 ;
  }
  zimt::array_t < 2 , px_t > trg ( { width , height } ) ;

  // the storer is the third component for zimt::process, next
  // to the get_t and act - it populates 'trg' with the pixels
  // the pipeline produces.
  zimt::storer < float , nchannels , 2 , LANES > cstor ( trg ) ;

  zimt::bill_t bill ;
  // with a cropping specification, the discrete coordinates fed
  // into the pixel pipeline have to be raised by the crop's
  // upper left corner:
  if ( args.store_cropped )
  {
    bill.get_offset.push_back ( args.p_crop_x0 ) ;
    bill.get_offset.push_back ( args.p_crop_y0 ) ;
  }

  auto start = std::chrono::system_clock::now() ;
  // bill.njobs = 1 ; // to run the rendering single-threaded

  // for 'single' jobs, the target facet's 'brighten' factor is
  // undone by amplifying with its reciprocal.
  float unbrighten = 1.0f ;
  if ( args.single >= 0 )
  {
    const auto & fct ( args.facet_spec_v [ args.single ] ) ;
    if ( fct.brighten != 1.0 )
      unbrighten = 1.0 / fct.brighten ;
  }

  if ( unbrighten != 1.0f )
  {
    // TODO tentative. linear vs. sRGB is still unresolved!
    // Where 'brighten' is used now - so, here, and 'on the
    // other end of the pipeline' - a whole complex of processing
    // needs to be 'slotted in' to handle e.g. colour space
    // conversions.
    px_t amplify = unbrighten ;
    // leave an alpha channel, if present, unamplified
    if ( nchannels == 2 || nchannels == 4 )
      amplify [ nchannels - 1 ] = 1.0 ;
    amplify_type < px_t , px_t , px_t , LANES > amp ( amplify ) ;
    // run the multithreaded pipeline with the amplifier chained in
    zimt::process ( trg.shape , get , act + amp , cstor , bill ) ;
  }
  else
  {
    // run the multithreaded pipeline filling the target image
    zimt::process ( trg.shape , get , act , cstor , bill ) ;
  }

  auto stop = std::chrono::system_clock::now() ;
  auto msec = std::chrono::duration_cast<std::chrono::milliseconds>
                ( stop - start ) . count() ;
  if ( args.verbose )
  {
    std::cout << "frame rendering time: " << msec << " ms" << std::endl ;
  }
  rt_cumulated += msec ;

  // store the result to disk - either as a single frame added
  // to the video file, or as a single image stored individually.
  // if ( args.seqfile != std::string() )
  // {
  //   push_video_frame ( trg ) ;
  // }
  // else
  {
    if ( args.verbose )
      std::cout << "saving output image: " << args.output << std::endl ;
    save_array < nchannels > ( args.output , trg ) ;
  }
}
// we'll use a common wrapper class for all synopsis-forming objects.
// This class wraps a specific synopsis-forming object handling only
// 'normal' ray coordinates as input and adds an operator() overload
// which handles 'ninepack' arguments and twining, using operator()
// of the 'wrappee' to evaluate individual rays in the twining spread.
// synopsis_t: common wrapper for synopsis-forming objects. The
// wrapped base class SF < NCH > handles only 'normal' ray input;
// this wrapper adds an operator() overload for 'ninepack' input,
// implementing 'twining' by evaluating the base repeatedly.
template < std::size_t NCH , template < std::size_t > class SF >
struct synopsis_t
: public SF < NCH >
{
typedef SF < NCH > base_t ;
using base_t::sz ;
using base_t::operator() ;
using typename base_t::ray_v ;
using typename base_t::px_v ;
using typename base_t::env_t ;
// a 'ninepack' concatenates three rays: the 'central' pickup ray
// plus the rays for the target coordinates one to the right and
// one down, from which derivatives are formed by differencing.
typedef zimt::xel_t < float , 9 > ninepack_t ;
typedef simdized_type < ninepack_t , LANES > ninepack_v ;
// 'scratch' will hold all ray packets which will react with
// a given twining coefficient 'cf' - one entry per facet.
std::vector < ray_v > scratch ;
// we hold a copy of the spread here, which is modified in the c'tor
// to incorporate the 'bias' values.
std::vector < zimt::xel_t < float , 3 > > spread ;
// c'tor: pass the environment vector on to the base class and
// pre-scale the twining coefficients' spatial factors by 'bias'.
synopsis_t ( std::vector < env_t > & env_v ,
const float & bias = 4.0f )
: base_t ( env_v ) ,
scratch ( env_v.size() ) ,
spread ( args.twine_spread )
{
// we apply the 'bias' value to the twining coefficients, to avoid
// the multiplication with the dx/dy value which would be needed
// otherwise. This 'bias' is the reciprocal value of the 'bias_x'
// and 'bias_y' values used in steppers to form slightly
// offsetted planar coordinates for twining.
for ( auto & cf : spread )
{
cf[0] *= bias ;
cf[1] *= bias ;
}
}
// operator() for 'ninepacks'. This implements 'twining': evaluation
// at several rays in the vicinity of the central ray and formation
// of a weighted sum of these evaluations. The incoming 'ninepack'
// holds three concatenated vectorized rays: the 'central' rays,
// the rays which result from a discrete target coordinate set off
// by one to the right, and the rays which result from a discrete
// target coordinate set off one down. By differencing, we obtain
// two vectors representing 'canonical' steps in the target image's
// horizontal and vertical. We use these two vectors as a basis for
// the 'twining' operation: each twining coefficient has two factors
// describing its spatial aspect, and multiplying them with the
// corresponding basis vectors and summing up produces the offset
// we need to add to the 'central' or 'pickup' ray to obtain a
// 'deflected' ray. The substrate is evaluated at each deflected
// ray, and the results are combined in a weighted sum, where the
// weight comes from the third factor in the twining coefficient.
void operator() ( const std::vector < ninepack_v > & pv ,
px_v & trg ,
const std::size_t & cap = LANES )
{
typedef simdized_type < float , LANES > f_v ;
typedef simdized_type < zimt::xel_t < float , 3 > , LANES > ray_v ;
typedef simdized_type < zimt::xel_t < float , 9 > , LANES > ray9_v;
// trg is built up as a weighted sum of contributing values
// resulting from the reaction with a given coefficient, so we
// clear it before we begin.
trg = 0.0f ;
// for each coefficient in the twining 'spread'
for ( const auto & cf : spread )
{
// deflect each facet's central ray by the coefficient's spatial
// factors, using the differenced basis vectors du and dv:
for ( std::size_t facet = 0 ; facet < sz ; facet++ )
{
const ray9_v & in ( pv[facet] ) ; // shorthand
ray_v p0 { in[0] , in[1] , in[2] } ;
ray_v du { in[3] - in[0] , in[4] - in[1] , in[5] - in[2] } ;
ray_v dv { in[6] - in[0] , in[7] - in[1] , in[8] - in[2] } ;
scratch [ facet ] = p0 + cf[0] * du + cf[1] * dv ;
}
// now we have sz entries in scratch, and we can call
// the three-component form above to produce a synopsis
// for the contributors reacting with the current coefficient
px_v help ;
base_t::operator() ( scratch , help , cap ) ;
// 'help' is the pixel value, which is weighted with the
// weighting value in the twining kernel and added to what's
// in trg already.
trg += cf[2] * help ;
}
// and that's it - trg has the result.
}
} ;
// this class is an example for giving preference to one specific
// facet - as opposed to code mixing several values. There are two
// member functions. The first one processes a std::vector of simdized
// rays, selecting the rays which are most central (by calculating the
// angle with the 'forward' ray (0,0,1)) and evaluating there. The
// angle can be calculated in this way because the rays we receive
// here are in the facets' coordinate system - the orientations of
// the facets and the virtual camera are encoded in the 'steppers'
// which generate the rays we receive here. The second member function
// receives a std::vector of 'ninepacks' which are the basis for using
// 'twining'. It does in turn call the first member repeatedly - once
// for each of the twining kernel's coefficients. Note how this softens
// the hard discontinuity between 'colliding' facets: each of the
// component values of the weighted sum which is formed by the twining
// operations may stem from a *different* facet, so near the border,
// result pixels will contain a *blend* of components from both facets.
// While twining is computationally expensive, this is a strategy to
// avoid the ungainly staircase artifacts where facets collide. The
// final result is like a scaled-down version of a much larger image,
// and the scaling-down would have the same effect of blending pixels
// from colliding facets where pixels from two facets fall into the
// same bin in the scaled-down result.
// The result is a voronoi diagram - apart from the fact that where
// facets don't cover their full share of their tangent plane, other
// facets may be visible. If all facets are sufficiently large and
// there are no gaps, the result is much like a voronoi diagram. envutil
// also allows facets in other projections than rectilinear - with
// such facets, the image center is also the reference but the samples
// are - conceptually - no longer on a tangent plane but on the 'draped'
// 2D manifold - e.g. a sphere for spherically-projected facets. Still,
// the proximity criterion is calculated with 3D rays - but where
// rectilinear facets have straight intersections, non-rectilinear
// ones may have curved intersections. The ray-based proximity criterion
// is slower to compute than the 2D distance-to-center one might also
// use (on target coordinates) - but ray-based is universally applicable
// and produces values which compare across different projections,
// whereas the 2D target coordinate varies with the projection used.
// The code in voronoi_syn is only suitable for fully-opaque images:
// to handle data with transparency, figuring out a single 'winner'
// is not sufficient, because the winning pixel might be partially or
// fully transparent, in which case worse-ranking pixels should 'shine
// through'. To achieve this, z-buffering of the data is needed - like
// it is done in lux (see pv_rendering.cc, class 'multi_facet_type_4').
// class voronoi_syn_plus implements handling of transparent facets
// with alpha compositing in z-buffer sequence, see there!
// The lux code also shows that even with transparency, a lot of
// potential shortcuts can save time and not every pixel needs the
// 'full treatment'. lux also employs facet pre-selection, inspecting
// each facet's frustum and it's intersection with the target image's.
// This process of 'culling' can greatly reduce computational load if
// the current view only involves a small amount of facets. When
// stitching panoramas, though, it's often the case that all facets are
// involved, so the frustum inspection code does not help. In envutil,
// I intend to use a different method: coarse masking and tiling. The
// idea is to use a coarse mask of the intersection of each facet with
// the target image stored in a cubemap (preferably biatan6 type) with
// relatively low resolution which coincides with a tiling of the target
// image so that each pixel in the coarse mask is 'as large' as a tile
// of the target image. If, then, the target image is built tile by tile,
// the participating facets can be found by evaluating their corresponding
// coarse masks and only picking those which aren't zero at the given
// location. A biatan6-projected cubemap is quite close to ideal when it
// comes to evenness of represented spherical 'real estate' per pixel
// of the map. The operation would be more fine-grained compared to the
// frustum-based approach.
// This aside, here goes: this is the code for fully opaque facets:
// _voronoi_syn: synopsis-forming object for fully-opaque facets.
// For every lane, the facet whose pickup ray is 'most central'
// (largest scaled z component) wins and provides the pixel value,
// yielding a voronoi-like composition of the facets.
template < std::size_t NCH >
struct _voronoi_syn
{
typedef environment < float , float , NCH , LANES > env_t ;
typedef typename env_t::px_t px_t ;
typedef simdized_type < px_t , LANES > px_v ;
static const std::size_t nch = px_t::size() ;
// number of facets (environment objects) in-play
const int sz ;
typedef zimt::xel_t < float , 3 > ray_t ;
typedef simdized_type < float , LANES > f_v ;
typedef simdized_type < ray_t , LANES > ray_v ;
// reference to the per-facet environment objects
std::vector < env_t > & env_v ;
_voronoi_syn ( std::vector < env_t > & _env_v )
: env_v ( _env_v ) ,
sz ( _env_v.size() )
{ }
// general calculation of the angles between rays:
//
// f_v angle ( const ray_v & a , const ray_v & b ) const
// {
// auto dot = ( a * b ) . sum() ;
// auto costheta = dot / ( norm ( a ) * norm ( b ) ) ;
// return acos ( costheta ) ;
// }
// this helper function calculates the vector of angles between
// a vector of rays and (0,0,1) - unit forward in our CS.
// TODO: many steppers can be modified to produce normalized rays
// without additional computational cost, and even if there is some
// cost, it should not amount to more than the division by the norm
// which we use here to obtain the angle. With incoming normalized
// rays, it would be sufficient to look at the z component of the
// incoming ray - due to the pre-rotation, z will fall from +1 with
// true forward rays to -1 for rays pointing straight back. So the
// facet where the ray has the largest z coordinate is the winner.
// ----> done.
// .. but: normalizing the rays destroys some information, which
// is needed e.g. when applying translation parameters (or so it
// seems). Most steppers do produce normalized rays, so for these
// the max_z or angle_against_001 criterion could be obtained
// without calculation from the z coordinate. Only steppers where
// the ray isn't automatically normalized would need a calculation.
// Maybe the value can be produced by the stepper?
// f_v angle_against_001 ( const ray_v & a ) const
// {
// return acos ( a[2] / norm ( a ) ) ;
// }
// this is the operator() overload for incoming ray data.
void operator() (
const std::vector < ray_v > & pv ,
px_v & trg ,
const std::size_t & cap = LANES ) const
{
// this vector will contain the index of the 'most suitable'
// facet at the given location, according to the criterion.
simdized_type < int , LANES > champion_v ( -1 ) ;
// we initialize 'max_z' to a very small value - no
// 'real' z value can ever be this small.
simdized_type < float , LANES >
max_z ( std::numeric_limits<float>::lowest() ) ;
// next_best starts out with an invalid facet index, and if this
// isn't updated during processing, it serves as an indicator
// that no facet is hit by any ray - a special case which can
// be dealt with very efficiently: produce 0000
int next_best = -1 ;
// test which of the rays in pv[0] pass through facet 0
auto valid = env_v[0].get_mask ( pv[0] ) ;
// do any of the rays hit the facet? then update 'next_best',
// which, at this point, is still -1, indicating 'all misses'.
// This can't be the case any more. Where rays hit this first
// facet (indicated by 'valid'), we now have a champion, which
// we save in champion_v. We also update max_z: we overwrite
// the initial 'impossibly small' value with a real z value
// where the rays hit the facet.
if ( any_of ( valid ) )
{
next_best = 0 ;
champion_v ( valid ) = 0 ;
max_z ( valid ) = pv[0][2] * env_v[0].recip_step ;
}
// now go through the facets in turn and replace the current
// champions where the current facet provides better candidates
for ( int i = 1 ; i < sz ; i++ )
{
// find the mask for facet i - if no rays hit the facet,
// go to the next loop iteration straight away
valid = env_v[i].get_mask ( pv[i] ) ;
if ( none_of ( valid ) )
continue ;
// we initialize 'current_z' with an impossibly small value
// and 'patch in' z values where rays hit the facet.
// 'current_z' encodes our 'quality' criterion: the largest z
// we find is the best.
simdized_type < float , LANES >
current_z ( std::numeric_limits<float>::lowest() ) ;
current_z ( valid ) = pv[i][2] * env_v[i].recip_step ;
// now we test whether current z values are larger than the
// largest we've seen so far
auto mask = ( current_z > max_z ) ;
// are any current z larger? then update 'next_best',
// which, at this point, may still be -1, or already a value
// set by a previous loop iteration. Also update max_z,
// and, finally, update the vector of champions where new
// maximal z values were found to the index of the facet we're
// looking at now.
if ( any_of ( mask ) )
{
next_best = i ;
max_z ( mask ) = current_z ;
champion_v ( mask ) = i ;
}
}
// first check: if next_best is still -1, all facets were missed
// and the result is 0000. This is the fastest outcome, and if
// part of the output isn't covered by any facets, this will
// save a good deal of cycles.
if ( next_best == -1 )
{
trg = 0.0f ;
}
// now check: if all occupants of 'champion_v' are equal,
// we can special-case. Since this is the most common case,
// this is beneficial for performance. Note that we don't
// check the contents of champion_v for equality - next_best
// certainly holds a facet index to a facet which *is* hit,
// (it isn't -1, so it is set to a 'hit' facet's index)
// and if all champions are equal, they must be equal to this
// one, which is the only one in-play in this special case.
else if ( all_of ( champion_v == next_best ) )
{
env_v [ next_best ] . eval ( pv [ next_best ] , trg ) ;
}
else
{
// champion_v's occupants are not all equal, so we need to
// evaluate every in-play facet and compose the result.
// some rays may not have hit any facet, so we clear trg
// first. We refrain from testing for occupants with value
// -1, because forming a mask and performing a masked assignment
// is more costly than clearing trg. this is also the 'worst
// case' - performance-wise - where the rays hit several facets.
// Luckily, this is also usually quite rare - it only occurs
// where facets collide or near facet edges.
trg = 0.0f ;
for ( int i = 0 ; i < sz ; i++ )
{
// form a mask which is true where the champion is i.
auto mask = ( champion_v == i ) ;
// if this mask has occupants, we need to obtain pixel data
// for this facet and transfer them to the corresponding lanes.
if ( any_of ( mask ) )
{
px_v help ;
env_v [ i ] . eval ( pv [ i ] , help ) ;
trg ( mask ) = help ;
}
}
}
}
} ;
// convenience alias: the opaque-facet synopsis object _voronoi_syn,
// wrapped in synopsis_t to gain the ninepack/twining overload.
template < std::size_t NCH >
using voronoi_syn = synopsis_t < NCH , _voronoi_syn > ;
// variant with z-buffering for facets with alpha channel:
template < std::size_t NCH , bool fix_duv = true >
struct _voronoi_syn_plus
{
typedef environment < float , float , NCH , LANES > env_t ;
typedef typename env_t::px_t px_t ;
typedef simdized_type < px_t , LANES > px_v ;
static const std::size_t nch = px_t::size() ;
const int sz ;
typedef zimt::xel_t < float , 3 > ray_t ;
typedef simdized_type < float , LANES > f_v ;
typedef simdized_type < ray_t , LANES > ray_v ;
typedef simdized_type < int , LANES > index_v ;
std::vector < env_t > & env_v ;
_voronoi_syn_plus ( std::vector < env_t > & _env_v )
: env_v ( _env_v ) ,
sz ( _env_v.size() )
{ }
// helper function to swap occupants in two vectorized objects
// where a mask is true.
template < typename VT , typename M >
static void masked_swap ( VT & a , VT & b , const M & mask )
{
auto help = a ;
a ( mask ) = b ;
b ( mask ) = help ;
}
// this is the operator() overload for incoming ray data. The next
// one down is for 'ninepacks'. For each SIMD lane it z-buffers the
// facets hit by that lane's ray (best quality first) and then
// alpha-composites the per-facet pixel values front-to-back into trg.
// pv holds one vectorized ray per facet (the ray expressed in that
// facet's coordinate system - assumed; confirm against the caller),
// trg receives the composited pixel, and 'cap' is the number of
// valid lanes (not used in the body).
void operator() (
const std::vector < ray_v > & pv ,
px_v & trg ,
const std::size_t & cap = LANES ) const
{
// Here we have a set of sz index vectors to encode the
// z-buffering. We'll use a 'trickle-up sort', and when
// we're done, the first index vector should hold just the
// same 'champions' as the single champion vector in class
// voronoi_syn. The next one will hold next-best indices
// where the 'maximal z' criterion came out smaller, and
// so forth down to the 'worst' indices. We also need to
// keep track of the z values: that's in max_z.
std::vector < index_v > champion_v ( sz ) ;
std::vector < f_v > max_z ( sz ) ;
// next_best starts out with an invalid facet index, and if this
// isn't updated during processing, it serves as an indicator
// that no facet is hit by any ray - a special case which can
// be dealt with very efficiently: produce 0000
// 'layers' indicates how many layers we have to alpha-composit
// to form the final result.
int next_best = -1 ;
int layers = 0 ;
// we initialize max_z[0] to a very small value - no real
// z value can ever be this small. This is so that when
// this z value is compared to a real z, it will always
// come out smaller.
max_z[0] = std::numeric_limits<float>::lowest() ;
// Initially, we have no valid 'champions'
champion_v[0] = -1 ;
// test which of the rays in pv[0] pass through facet 0
auto valid = env_v[0].get_mask ( pv[0] ) ;
// do any of the rays hit the facet? then update 'next_best',
// which, at this point, is still -1, indicating 'all misses'.
// This can't be the case any more. We now have a first layer.
// Also update max_z[0]: overwrite the initial 'impossible'
// value with a true z value wherever the rays hit the facet.
if ( any_of ( valid ) )
{
next_best = 0 ;
layers = 1 ;
champion_v[0] ( valid ) = 0 ;
max_z[0] ( valid ) = pv[0][2] * env_v[0].recip_step ;
}
// now go through the other facets in turn and perform the
// 'trickle-up' with values for each current facet. This is
// computationally intensive - the worst case is that we 'see'
// the lowest angles last and they have to 'trickle up' all
// the way. Schemes to pre-order the facets so that the most
// likely candidate is processed first can reduce the need for
// 'trickling up' - if the values come in in sorted order,
// there's no need to sort them.
for ( int i = 1 ; i < sz ; i++ )
{
// find the mask for facet i - if no rays hit the facet,
// go to the next loop iteration straight away
valid = env_v[i].get_mask ( pv[i] ) ;
if ( none_of ( valid ) )
continue ;
// we let 'next_best' 'tag along': it ends up holding the
// index of the last facet which received any hits.
next_best = i ;
// we initialize 'current_z' with an impossibly small value
// and 'patch in' z values where rays hit the facet.
// the z value encodes our 'quality' criterion: the
// largest z we find is the best.
auto & current_z ( max_z [ layers ] ) ;
auto & current_champion ( champion_v [ layers ] ) ;
current_z = std::numeric_limits<float>::lowest() ;
current_z ( valid ) = pv[i][2] * env_v[i].recip_step ;
current_champion = -1 ;
current_champion ( valid ) = i ;
// now the trickle-up: bubble the new layer's entries up
// through the already-sorted layers, lane by lane.
for ( std::size_t l = layers ; l > 0 ; --l )
{
// are any z values in layer l larger than in layer l-1?
auto mask = ( max_z [ l ] > max_z [ l - 1 ] ) ;
// if not, everything is in sorted order, we can leave
// the loop prematurely, because all layers with lower
// indices are already sorted.
if ( none_of ( mask ) )
break ;
// if yes, swap these larger z values so that they move
// to level l - 1, and do the same for the indices. This
// is the 'trickle-up' - if a very large z came in
// in some lane, the trickle-up might take it 'through'
// all layers which were processed already, until it
// 'hits the ceiling'.
masked_swap ( max_z [ l ] , max_z [ l - 1 ] , mask ) ;
masked_swap ( champion_v [ l ] , champion_v [ l - 1 ] , mask ) ;
}
// we need to update 'layers'
++layers ;
}
// first check: if 'layers' is still 0, all facets were missed
// and the result is 0000. This is the fastest outcome, and if
// part of the output isn't covered by any facets, this will
// save a good deal of cycles.
trg = 0.0f ;
if ( layers == 0 )
{
return ;
}
// if every lane's top-layer champion is the same facet, we may
// be able to avoid evaluating all other facets:
if ( all_of ( champion_v[0] == next_best ) )
{
// a very common scenario which we special-case
px_v help ;
env_v [ next_best ] . eval ( pv [ next_best ] , help ) ;
const auto & alpha ( help [ nch - 1 ] ) ;
if ( all_of ( alpha >= 1.0f ) )
{
// fully opaque homogeneous top layer. we're done:
trg = help ;
return ;
}
}
// evaluate all facets. let's assume we have associated alpha.
// We assume that the last channel in each pixel holds an alpha
// value in the range of zero to one.
// It's enough to evaluate the facets which have actually yielded
// contributions, but we'd have to save and process that information.
std::vector < px_v > lv ( sz ) ;
for ( std::size_t i = 0 ; i < sz ; i++ )
{
env_v [ i ] . eval ( pv [ i ] , lv [ i ] ) ;
// assert ( all_of ( lv [ i ] [ nch - 1 ] <= 1.0f ) ) ;
// if we had unassociated alpha incoming:
// for ( int ch = 0 ; ch < ( nch - 1 ) ; ch++ )
// lv [ i ] [ ch ] *= lv [ i ] [ nch - 1 ] ;
}
// now we have extracted the information in pv and the
// corresponding pixel data from env_v, and we can work
// our way through the layers to form a composite result.
for ( std::size_t i = 0 ; i < layers ; i++ )
{
// at the current layer, which are the 'champions'?
auto indexes = champion_v [ i ] ;
// we're only interested in 'facet indices proper', not
// in any lanes which hold -1 to indicate there are no
// contributions in this layer.
auto mask = ( champion_v[i] != -1 ) ;
if ( none_of ( mask ) )
continue ;
// we convert the vector of 'champion' indices into a set
// of indexes for gathering from the pixel values in lv.
// This assumes each px_v in lv occupies nch * LANES
// contiguous floats - channel after channel, lane by lane.
indexes *= int ( LANES * nch ) ;
indexes += index_v::iota() ;
// we gather into 'help' - after the gathering is complete,
// help will hold a composite, where each lane is set to the
// corresponding value in that pixel vector which is the
// champion for that lane.
px_v help ;
float * pf = (float*) ( & ( lv[0] ) ) ;
for ( int ch = 0 ; ch < nch ; ch++ )
help[ch].gather ( pf , indexes + ( ch * LANES ) ) ;
// if we're processing the top layer, we simply copy 'help'
// - omitting lanes which don't contribute
if ( i == 0 )
{
trg ( mask ) = help ;
}
// subsequent layers are alpha-composited onto the result
// we have so far. Note that the alpha-compositing is done
// with associated alpha, which makes it simple. The alpha
// channel (ch == nch-1) is updated last, so trg[nch-1] is
// still the accumulated alpha while the color channels are
// being composited.
else
{
for ( int ch = 0 ; ch < nch ; ch++ )
trg[ch] ( mask ) += ( 1.0f - trg[nch-1] ) * help[ch] ;
}
// // if the result so far is already completely opaque, we
// // can ignore the layers we haven't yet processed - they
// // could not possibly contribute anything. (detrimental)
//
// if ( all_of ( trg[nch-1] >= 1.0f ) )
// break ;
}
// for unassociated alpha, we'd now de-associate
// for ( int ch = 0 ; ch < ( nch - 1 ) ; ch++ )
// {
// trg[ch] /= trg[nch-1] ;
// trg[ch] ( trg[nch-1] == 0.0f ) = 0.0f ;
// }
}
} ;
// public-facing alias: wraps the _voronoi_syn_plus template in
// synopsis_t (declared elsewhere) to form the NCH-channel synopsis
// functor used by the rendering code
template < std::size_t NCH >
using voronoi_syn_plus = synopsis_t < NCH , _voronoi_syn_plus > ;