@@ -4,8 +4,10 @@ import (
4
4
"context"
5
5
"database/sql"
6
6
"encoding/json"
7
+ "errors"
7
8
"fmt"
8
9
"log/slog"
10
+ "slices"
9
11
"strings"
10
12
"time"
11
13
@@ -25,10 +27,13 @@ type (
25
27
}
26
28
27
29
DBObject struct {
28
- svcname string
29
- svcID string
30
- clusterID string
31
- availStatus string
30
+ svcname string
31
+ svcID string
32
+ clusterID string
33
+
34
+ DBObjStatus
35
+
36
+ env string
32
37
}
33
38
34
39
DBInstance struct {
72
77
redis * redis.Client
73
78
db DBOperater
74
79
oDb * opensvcDB
80
+ ev EventPublisher
75
81
76
82
nodeID string
77
83
clusterID string
80
86
nodeEnv string
81
87
callerNode * DBNode
82
88
83
- changes map [string ]struct {}
84
- rawData []byte
89
+ changes map [string ]struct {}
90
+ rawChanges string
91
+ rawData []byte
85
92
86
93
data dataProvider
87
94
@@ -97,17 +104,31 @@ type (
97
104
byInstanceName map [string ]* DBInstance
98
105
byInstanceID map [string ]* DBInstance
99
106
}
107
+
108
+ InstanceID struct {
109
+ nodeID string
110
+ svcID string
111
+ }
100
112
)
101
113
114
+ func (n * DBNode ) String () string {
115
+ return fmt .Sprintf ("node: {nodename: %s, node_id: %s, cluster_id: %s, app: %s}" , n .nodename , n .nodeID , n .clusterID , n .app )
116
+ }
117
+
118
+ func (i * InstanceID ) String () string {
119
+ return fmt .Sprintf ("instance id: %s@%s" , i .svcID , i .nodeID )
120
+ }
121
+
102
122
func (t * Worker ) handleDaemonStatus (nodeID string ) error {
103
123
defer logDurationInfo (fmt .Sprintf ("handleDaemonStatus %s with tx %v" , nodeID , t .WithTx ), time .Now ())
104
- slog .Info (fmt .Sprintf ("handleDaemonStatus node_id: %s" , nodeID ))
124
+ slog .Info (fmt .Sprintf ("handleDaemonStatus starting for node_id %s" , nodeID ))
105
125
ctx := context .Background ()
106
126
107
127
d := daemonStatus {
108
128
ctx : ctx ,
109
129
redis : t .Redis ,
110
130
nodeID : nodeID ,
131
+ ev : t .Ev ,
111
132
112
133
changes : make (map [string ]struct {}),
113
134
@@ -156,6 +177,9 @@ func (t *Worker) handleDaemonStatus(nodeID string) error {
156
177
d .dbFindInstance ,
157
178
d .dbUpdateServices ,
158
179
d .dbUpdateInstance ,
180
+ d .dbPurgeInstance ,
181
+ d .dbPurgeService ,
182
+ d .pushFromTableChanges ,
159
183
)
160
184
if err != nil {
161
185
if tx , ok := d .db .(DBTxer ); ok {
@@ -171,8 +195,7 @@ func (t *Worker) handleDaemonStatus(nodeID string) error {
171
195
return fmt .Errorf ("handleDaemonStatus commit: %w" , err )
172
196
}
173
197
}
174
- slog .Info (fmt .Sprintf ("handleDaemonStatus done: node_id: %s cluster_id: %s, cluster_name: %s" ,
175
- d .nodeID , d .clusterID , d .clusterName ))
198
+ slog .Info (fmt .Sprintf ("handleDaemonStatus done for %s" , d .byNodeID [d .nodeID ]))
176
199
for k , v := range d .byNodename {
177
200
slog .Debug (fmt .Sprintf ("found db node %s: %#v" , k , v ))
178
201
}
@@ -213,6 +236,7 @@ func (d *daemonStatus) getChanges() error {
213
236
} else if err != redis .Nil {
214
237
return fmt .Errorf ("getChanges: HGET %s %s: %w" , cache .KeyDaemonStatusChangesHash , d .nodeID , err )
215
238
}
239
+ d .rawChanges = s
216
240
for _ , change := range strings .Fields (s ) {
217
241
d .changes [change ] = struct {}{}
218
242
}
@@ -347,6 +371,7 @@ func (d *daemonStatus) dbFindNodes() error {
347
371
for nodename := range d .byNodename {
348
372
d .nodes = append (d .nodes , nodename )
349
373
}
374
+ slog .Info (fmt .Sprintf ("handleDaemonStatus run details: %s changes: [%s]" , callerNode , d .rawChanges ))
350
375
return nil
351
376
}
352
377
@@ -373,7 +398,7 @@ func (d *daemonStatus) dataToNodeFrozen() error {
373
398
func (d * daemonStatus ) dbFindServices () error {
374
399
defer logDuration ("dbFindServices" , time .Now ())
375
400
const queryFindServicesInfo = "" +
376
- "SELECT svcname, svc_id, cluster_id, svc_availstatus" +
401
+ "SELECT svcname, svc_id, cluster_id, svc_availstatus, svc_env, svc_status, svc_placement, svc_provisioned " +
377
402
" FROM services" +
378
403
" WHERE cluster_id = ? AND svcname IN (?"
379
404
objectNames , err := d .data .objectNames ()
@@ -405,12 +430,13 @@ func (d *daemonStatus) dbFindServices() error {
405
430
}
406
431
defer func () { _ = rows .Close () }()
407
432
for rows .Next () {
408
- var o DBObject
409
- if err := rows .Scan (& o .svcname , & o .svcID , & o .clusterID , & o .availStatus ); err != nil {
433
+ o := DBObject { DBObjStatus : DBObjStatus {}}
434
+ if err := rows .Scan (& o .svcname , & o .svcID , & o .clusterID , & o .availStatus , & o . env , & o . overallStatus , & o . placement , & o . provisioned ); err != nil {
410
435
return fmt .Errorf ("dbFindServices scan %s: %w" , d .nodeID , err )
411
436
}
412
437
d .byObjectName [o .svcname ] = & o
413
438
d .byObjectID [o .svcID ] = & o
439
+ slog .Debug (fmt .Sprintf ("dbFindServices %s (%s)" , o .svcname , o .svcID ))
414
440
}
415
441
if err := rows .Err (); err != nil {
416
442
return fmt .Errorf ("dbFindServices FindClusterNodesInfo %s: %w" , d .nodeID , err )
@@ -449,14 +475,16 @@ func (d *daemonStatus) dbFindInstance() error {
449
475
for rows .Next () {
450
476
var o DBInstance
451
477
if err := rows .Scan (& o .svcID , & o .nodeID , & o .Frozen ); err != nil {
452
- return fmt .Errorf ("dbFindServices scan %s: %w" , d .nodeID , err )
478
+ return fmt .Errorf ("dbFindInstance scan %s: %w" , d .nodeID , err )
453
479
}
454
480
if n , ok := d .byNodeID [o .nodeID ]; ok {
455
481
// Only pickup instances from known nodes
456
482
if s , ok := d .byObjectID [o .svcID ]; ok {
457
483
// Only pickup instances from known objects
458
484
d .byInstanceName [s .svcname + "@" + n .nodename ] = & o
459
485
d .byInstanceID [s .svcID + "@" + n .nodeID ] = & o
486
+ slog .Debug (fmt .Sprintf ("dbFindInstance found %s@%s (%s@%s)" ,
487
+ s .svcname , n .nodename , s .svcID , n .nodeID ))
460
488
}
461
489
}
462
490
}
@@ -521,6 +549,11 @@ func (d *daemonStatus) dbUpdateServices() error {
521
549
if err := d .oDb .updateObjectStatus (d .ctx , objectID , oStatus ); err != nil {
522
550
return fmt .Errorf ("dbUpdateServices can't update object %s %s: %w" , objectName , objectID , err )
523
551
}
552
+ if d .byObjectID [objectID ].availStatus != oStatus .availStatus {
553
+ slog .Debug (fmt .Sprintf ("dbUpdateServices %s avail status %s -> %s" , objectName , d .byObjectID [objectID ].availStatus , oStatus .availStatus ))
554
+ }
555
+ // refresh local cache
556
+ d .byObjectID [objectID ].DBObjStatus = * oStatus
524
557
}
525
558
}
526
559
}
@@ -572,19 +605,21 @@ func (d *daemonStatus) dbUpdateInstance() error {
572
605
return fmt .Errorf ("dbUpdateInstance delete resources %s@%s: %w" , objID , nodeID , err )
573
606
}
574
607
} else {
575
- if err := d .instanceStatusUpdate (objID , nodeID , iStatus ); err != nil {
576
- return fmt .Errorf ("dbUpdateInstance update status %s@%s (%s@%s): %w" , objID , nodeID , objectName , nodename , err )
608
+ // set iStatus svcID and nodeID for db update
609
+ iStatus .svcID = objID
610
+ iStatus .nodeID = nodeID
611
+ if err := d .instanceStatusUpdate (objectName , nodename , iStatus ); err != nil {
612
+ return fmt .Errorf ("dbUpdateInstance update status %s@%s (%s@%s): %w" , objectName , nodename , objID , nodeID , err )
577
613
}
578
614
resourceObsoleteAt := time .Now ()
579
- if err := d .instanceResourceUpdate (objID , nodeID , iStatus ); err != nil {
580
- return fmt .Errorf ("dbUpdateInstance update resource %s@%s (%s@%s): %w" , objID , nodeID , objectName , nodename , err )
615
+ if err := d .instanceResourceUpdate (objectName , nodename , iStatus ); err != nil {
616
+ return fmt .Errorf ("dbUpdateInstance update resource %s@%s (%s@%s): %w" , objectName , nodename , objID , nodeID , err )
581
617
}
582
618
slog .Debug (fmt .Sprintf ("dbUpdateInstance deleting obsolete resources %s@%s" , objectName , nodename ))
583
619
if err := d .oDb .instanceResourcesDeleteObsolete (d .ctx , objID , nodeID , resourceObsoleteAt ); err != nil {
584
620
return fmt .Errorf ("dbUpdateInstance delete obsolete resources %s@%s: %w" , objID , nodeID , err )
585
621
}
586
622
}
587
- // TODO: update update_dash: service_frozen, service_not_on_primary, svcmon_not_updated
588
623
} else {
589
624
if iStatus .resources == nil {
590
625
// scaler or wrapper, for example
@@ -609,54 +644,128 @@ func (d *daemonStatus) dbUpdateInstance() error {
609
644
}
610
645
}
611
646
// TODO: update_container_node_fields
612
- slog .Debug (fmt .Sprintf ("dbUpdateInstance skip encap update %s@%s" , objectName , nodename ))
613
647
}
648
+ if err := d .oDb .dashboardInstanceFrozenUpdate (d .ctx , objID , nodeID , obj .env , iStatus .monFrozen > 0 ); err != nil {
649
+ return fmt .Errorf ("dbUpdateInstance update dashboard instance frozen %s@%s (%s@%s): %w" , objectName , nodename , objID , nodeID , err )
650
+ }
651
+ if err := d .oDb .dashboardDeleteInstanceNotUpdated (d .ctx , objID , nodeID ); err != nil {
652
+ return fmt .Errorf ("dbUpdateInstance update dashboard instance not updated %s@%s (%s@%s): %w" , objectName , nodename , objID , nodeID , err )
653
+ }
654
+ // TODO: verify if we need a placement non optimal alert for object/instance
655
+ // om2 has: monitor.services.'<path>'.placement = non-optimal
656
+ // om3 has: cluster.object.<path>.placement_state = non-optimal
657
+ // cluster.node.<node>.instance.<path>.monitor.is_ha_leader
658
+ // cluster.node.<node>.instance.<path>.monitor.is_leader
659
+ // collector v2 calls update_dash_service_not_on_primary (broken since no DEFAULT.autostart_node values)
614
660
}
615
661
if len (instanceMonitorStates ) == 1 && instanceMonitorStates ["idle" ] {
616
- // TODO: update dashboard service unavailable
617
- // TODO: update dashboard service_placement
618
- // TODO: update dashboard service_available_but_degraded
619
- // TODO: update dashboard flex_instances_started
620
- // TODO: update dashboardsflex_cpu)
662
+ var remove bool
663
+
664
+ remove = slices .Contains ([]string {"up" , "n/a" }, obj .availStatus )
665
+ if err := d .updateDashboardObject (obj , remove , NewDashboardObjectUnavailable ); err != nil {
666
+ return fmt .Errorf ("dbUpdateInstance on %s (%s): %w" , objID , objectName , err )
667
+ }
668
+
669
+ remove = slices .Contains ([]string {"optimal" , "n/a" }, obj .placement )
670
+ if err := d .updateDashboardObject (obj , remove , NewDashboardObjectPlacement ); err != nil {
671
+ return fmt .Errorf ("dbUpdateInstance on %s (%s): %w" , objID , objectName , err )
672
+ }
673
+
674
+ remove = slices .Contains ([]string {"up" , "n/a" }, obj .availStatus ) && slices .Contains ([]string {"up" , "n/a" }, obj .overallStatus )
675
+ if err := d .updateDashboardObject (obj , remove , NewDashboardObjectDegraded ); err != nil {
676
+ return fmt .Errorf ("dbUpdateInstance on %s (%s): %w" , objID , objectName , err )
677
+ }
678
+ // TODO: update_dash_flex_instances_started
679
+ // TODO: update_dash_flex_cpu
621
680
}
622
681
}
623
682
624
- // TODO: purge deleted data for instance (svcmon, dashboard, dashboard_events, svcdisks, resmon, checks_live,
625
- // comp_status, action_queue, resinfo, saves)
626
- //
627
- // TODO: purge deleted data for service (services, svcactions, drpservices, svcmon_log, resmon_log, svcmon_log_ack,
628
- // checks_settings, comp_log, comp_log_daily, comp_rulesets_services, comp_modulesets_services, log,
629
- // action_queue, svc_tags, form_output_results, svcmon_log_last, resmon_log_last)
630
683
return nil
631
684
}
632
685
633
- func (d * daemonStatus ) instanceResourceUpdate (objID string , nodeID string , iStatus * instanceStatus ) error {
686
+ func (d * daemonStatus ) instanceResourceUpdate (objName string , nodename string , iStatus * instanceStatus ) error {
634
687
for _ , res := range iStatus .InstanceResources () {
635
- slog .Debug (fmt .Sprintf ("updating instance resource %s@%s %s" , objID , nodeID , res .rid ))
688
+ slog .Debug (fmt .Sprintf ("updating instance resource %s@%s %s (%s@%s) " , objName , nodename , res .rid , iStatus . svcID , iStatus . nodeID ))
636
689
if err := d .oDb .instanceResourceUpdate (d .ctx , res ); err != nil {
637
690
return fmt .Errorf ("update resource %s: %w" , res .rid , err )
638
691
}
639
- slog .Debug (fmt .Sprintf ("updating instance resource log %s@%s %s" , objID , nodeID , res .rid ))
692
+ slog .Debug (fmt .Sprintf ("updating instance resource log %s@%s %s (%s@%s) " , objName , nodename , res .rid , iStatus . svcID , iStatus . nodeID ))
640
693
if err := d .oDb .instanceResourceLogUpdate (d .ctx , res ); err != nil {
641
694
return fmt .Errorf ("update resource log %s: %w" , res .rid , err )
642
695
}
643
696
}
644
697
return nil
645
698
}
646
699
647
- func (d * daemonStatus ) instanceStatusUpdate (objID string , nodeID string , iStatus * instanceStatus ) error {
648
- slog .Debug (fmt .Sprintf ("updating instance status %s@%s" , objID , nodeID ))
700
+ func (d * daemonStatus ) instanceStatusUpdate (objName string , nodename string , iStatus * instanceStatus ) error {
701
+ slog .Debug (fmt .Sprintf ("updating instance status %s@%s (%s@%s) " , objName , nodename , iStatus . svcID , iStatus . nodeID ))
649
702
if err := d .oDb .instanceStatusUpdate (d .ctx , & iStatus .DBInstanceStatus ); err != nil {
650
703
return fmt .Errorf ("update instance status: %w" , err )
651
704
}
652
- slog .Debug (fmt .Sprintf ("instanceStatusUpdate updating status log %s@%s" , objID , nodeID ))
705
+ slog .Debug (fmt .Sprintf ("instanceStatusUpdate updating status log %s@%s (%s@%s) " , objName , nodename , iStatus . svcID , iStatus . nodeID ))
653
706
err := d .oDb .instanceStatusLogUpdate (d .ctx , & iStatus .DBInstanceStatus )
654
707
if err != nil {
655
708
return fmt .Errorf ("update instance status log: %w" , err )
656
709
}
657
710
return nil
658
711
}
659
712
713
+ func (d * daemonStatus ) dbPurgeInstance () error {
714
+ defer logDuration ("dbPurgeInstance" , time .Now ())
715
+ var nodeIDs , objectNames []string
716
+ for objectName := range d .byObjectName {
717
+ objectNames = append (objectNames , objectName )
718
+ }
719
+ for nodeID := range d .byNodeID {
720
+ nodeIDs = append (nodeIDs , nodeID )
721
+ }
722
+ instanceIDs , err := d .oDb .getOrphanInstances (d .ctx , nodeIDs , objectNames )
723
+ if err != nil {
724
+ return fmt .Errorf ("dbPurgeInstance: getOrphanInstances: %w" , err )
725
+ }
726
+ for _ , instanceID := range instanceIDs {
727
+ if err1 := d .oDb .purgeInstances (d .ctx , instanceID ); err1 != nil {
728
+ err = errors .Join (err , fmt .Errorf ("purge instance %v: %w" , instanceID , err1 ))
729
+ }
730
+ }
731
+ if err != nil {
732
+ return fmt .Errorf ("dbPurgeInstance: %w" , err )
733
+ }
734
+ return nil
735
+ }
736
+
737
+ func (d * daemonStatus ) dbPurgeService () error {
738
+ defer logDuration ("dbPurgeService" , time .Now ())
739
+ objectIDs , err := d .oDb .objectIDWithPurgeTag (d .ctx , d .clusterID )
740
+ if err != nil {
741
+ err = fmt .Errorf ("dbPurgeService: objectIDWithPurgeTag: %w" , err )
742
+ return err
743
+ }
744
+ for _ , objectID := range objectIDs {
745
+ if err1 := d .oDb .purgeObject (d .ctx , objectID ); err1 != nil {
746
+ err = errors .Join (err , fmt .Errorf ("purge object %s: %w" , objectID , err1 ))
747
+ }
748
+ }
749
+ if err != nil {
750
+ return fmt .Errorf ("dbPurgeService: %w" , err )
751
+ }
752
+ return nil
753
+ }
754
+
755
+ func (d * daemonStatus ) pushFromTableChanges () error {
756
+ defer logDuration ("pushFromTableChanges" , time .Now ())
757
+ for _ , tableName := range d .oDb .tableChanges () {
758
+ slog .Debug (fmt .Sprintf ("pushFromTableChanges %s" , tableName ))
759
+ if err := d .oDb .updateTableModified (d .ctx , tableName ); err != nil {
760
+ return fmt .Errorf ("pushFromTableChanges: %w" , err )
761
+ }
762
+ if err := d .ev .EventPublish (tableName + "_change" , nil ); err != nil {
763
+ return fmt .Errorf ("EventPublish send %s: %w" , tableName , err )
764
+ }
765
+ }
766
+ return nil
767
+ }
768
+
660
769
// logDuration emits a debug-level log line reporting the time elapsed since
// begin, labelled with s. Callers in this file defer it with time.Now() as
// begin so the line is written when the surrounding function returns.
func logDuration(s string, begin time.Time) {
	slog.Debug(fmt.Sprintf("STAT: %s elapse: %s", s, time.Since(begin)))
}
0 commit comments