Skip to content

Commit e656f8c

Browse files
committed
improved checking diagnostics and connections to services
1 parent d04cd03 commit e656f8c

File tree

12 files changed

+109
-41
lines changed

12 files changed

+109
-41
lines changed

server/kraken/server/badge.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
from flask import abort, redirect
2020

2121
from . import consts
22+
from . import utils
2223

2324

2425
def get_branch_badge(branch_id, what=None):
2526
# get redis reference
2627
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
27-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_KRAKEN_DB)
28+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
29+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_KRAKEN_DB)
2830

2931
try:
3032
int(branch_id)

server/kraken/server/bg/jobs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ def _prepare_flow_summary(flow):
120120

121121
# get redis reference
122122
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
123-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_KRAKEN_DB)
123+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
124+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_KRAKEN_DB)
124125

125126
# prepare flow summary
126127
errors = False

server/kraken/server/consts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@
124124

125125

126126
DEFAULT_DB_URL = 'postgresql://kraken:kk123@localhost:5433/kraken'
127-
DEFAULT_REDIS_ADDR = 'localhost'
127+
DEFAULT_REDIS_ADDR = 'localhost:6379'
128128
DEFAULT_CLICKHOUSE_PORT = '9001'
129129
DEFAULT_CLICKHOUSE_ADDR = 'localhost:%s' % DEFAULT_CLICKHOUSE_PORT
130130
DEFAULT_CLICKHOUSE_URL = 'http://localhost:8123'

server/kraken/server/kkrq.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from . import srvcheck
2626
from . import consts
2727
from . import logs
28+
from . import utils
2829
from .. import version
2930
from .models import db, get_setting
3031

@@ -33,7 +34,8 @@
3334

3435
def enq(func, *args, **kwargs):
3536
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
36-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_RQ_DB)
37+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
38+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_RQ_DB)
3739
q = rq.Queue('kq', connection=rds)
3840
# job timeout is 20mins (1200s), interval between retries is 60s
3941
j = q.enqueue(func, args=args, kwargs=kwargs, retry=rq.Retry(max=2, interval=5), job_timeout=1200)
@@ -43,7 +45,8 @@ def enq(func, *args, **kwargs):
4345

4446
def enq_neck(func, *args, ignore_args=None):
4547
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
46-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_RQ_DB)
48+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
49+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_RQ_DB)
4750
data = dict(func=func.__name__,
4851
args=args,
4952
ignore_args=ignore_args)
@@ -54,7 +57,8 @@ def enq_neck(func, *args, ignore_args=None):
5457

5558
def get_jobs():
5659
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
57-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_RQ_DB)
60+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
61+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_RQ_DB)
5862
q = rq.Queue('kq', connection=rds)
5963

6064
jobs_ids = q.scheduled_job_registry.get_job_ids()
@@ -87,11 +91,12 @@ def get_jobs():
8791
def main():
8892
# check deps
8993
planner_url = os.environ.get('KRAKEN_PLANNER_URL', consts.DEFAULT_PLANNER_URL)
90-
srvcheck.check_url('planner', planner_url, 7997)
94+
srvcheck.wait_for_service('planner', planner_url, 7997)
9195

9296
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
93-
srvcheck.check_tcp_service('redis', redis_addr, 6379)
94-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_RQ_DB)
97+
srvcheck.wait_for_service('redis', redis_addr, 6379)
98+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
99+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_RQ_DB)
95100

96101
db_url = os.environ.get('KRAKEN_DB_URL', consts.DEFAULT_DB_URL)
97102
srvcheck.check_postgresql(db_url)

server/kraken/server/management.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from .cloud import aws, azure, k8s
3939
from . import notify
4040
from . import utils
41+
from . import minioops
4142

4243

4344
log = logging.getLogger(__name__)
@@ -750,7 +751,7 @@ def get_diagnostics():
750751

751752
# check postgresql
752753
pgsql_addr = os.environ.get('KRAKEN_DB_URL', consts.DEFAULT_DB_URL)
753-
pgsql_open = srvcheck.is_addr_open(pgsql_addr)
754+
pgsql_open = srvcheck.is_service_open(pgsql_addr)
754755
diags['postgresql'] = {
755756
'name': 'PostgreSQL',
756757
'address': pgsql_addr,
@@ -759,7 +760,7 @@ def get_diagnostics():
759760

760761
# check clickhouse
761762
ch_url = os.environ.get('KRAKEN_CLICKHOUSE_URL', consts.DEFAULT_CLICKHOUSE_URL)
762-
ch_open = srvcheck.is_addr_open(ch_url)
763+
ch_open = srvcheck.is_service_open(ch_url)
763764
diags['clickhouse'] = {
764765
'name': 'ClickHouse',
765766
'address': ch_url,
@@ -768,16 +769,27 @@ def get_diagnostics():
768769

769770
# check redis
770771
rds_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
771-
rds_open = srvcheck.is_addr_open(rds_addr, 6379)
772+
rds_open = srvcheck.is_service_open(rds_addr)
772773
diags['redis'] = {
773774
'name': 'Redis',
774775
'address': rds_addr,
775776
'open': rds_open
776777
}
777778

779+
# check minio
780+
minio_addr, _, _ = minioops.get_minio_addr()
781+
minio_open = srvcheck.is_service_open(minio_addr)
782+
if minio_open:
783+
minio_open = minioops.check_connection()
784+
diags['minio'] = {
785+
'name': 'MinIO',
786+
'address': minio_addr,
787+
'open': minio_open
788+
}
789+
778790
# check planner
779791
plnr_addr = os.environ.get('KRAKEN_PLANNER_URL', consts.DEFAULT_PLANNER_URL)
780-
plnr_open = srvcheck.is_addr_open(plnr_addr)
792+
plnr_open = srvcheck.is_service_open(plnr_addr)
781793
diags['planner'] = {
782794
'name': 'Kraken Planner',
783795
'address': plnr_addr,
@@ -892,7 +904,8 @@ def get_services_logs(services, level=None):
892904

893905
def get_errors_in_logs_count():
894906
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
895-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_KRAKEN_DB)
907+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
908+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_KRAKEN_DB)
896909

897910
errors_count = rds.get('error-logs-count')
898911
if errors_count:

server/kraken/server/qneck.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from . import logs
2525
from . import consts
2626
from . import srvcheck
27+
from . import utils
2728
from .. import version
2829
from .bg import jobs as bg_jobs
2930
from . import kkrq
@@ -60,8 +61,9 @@ def _check_jobs(waiting_jobs, executing_jobs):
6061

6162
def _main_loop():
6263
redis_addr = os.environ.get('KRAKEN_REDIS_ADDR', consts.DEFAULT_REDIS_ADDR)
63-
srvcheck.check_tcp_service('redis', redis_addr, 6379)
64-
rds = redis.Redis(host=redis_addr, port=6379, db=consts.REDIS_RQ_DB)
64+
srvcheck.wait_for_service('redis', redis_addr, 6379)
65+
redis_host, redis_port = utils.split_host_port(redis_addr, 6379)
66+
rds = redis.Redis(host=redis_host, port=redis_port, db=consts.REDIS_RQ_DB)
6567

6668
ps = rds.pubsub()
6769
ps.subscribe('qneck')

server/kraken/server/scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def create_app():
133133
planner_url = os.environ.get('KRAKEN_PLANNER_URL', consts.DEFAULT_PLANNER_URL)
134134

135135
srvcheck.check_postgresql(db_url)
136-
srvcheck.check_url('planner', planner_url, 7997)
136+
srvcheck.wait_for_service('planner', planner_url, 7997)
137137

138138
logs.setup_logging('scheduler')
139139
log.info('Kraken Scheduler started, version %s', version.version)

server/kraken/server/server.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from . import storage
3232
from . import job_log
3333
from . import badge
34+
from . import minioops
3435
from .. import version
3536

3637
log = logging.getLogger('server')
@@ -68,11 +69,24 @@ def _set_log_ctx():
6869
pass
6970

7071

71-
def _clear_log_ctx(a): # pylint: disable=unused-argument
72+
def _clear_request_ctx(a): # pylint: disable=unused-argument
7273
try:
7374
log.set_ctx(tool=None)
7475
except Exception:
75-
pass
76+
log.exception('IGNORED')
77+
78+
try:
79+
models.db.session.remove()
80+
except Exception:
81+
log.exception('IGNORED')
82+
83+
84+
def _unhandled_error_handler(err):
85+
log.info('ERROR %s', err)
86+
print('ERROR', err)
87+
log.info('ORIG ERROR %s', err.original_exception)
88+
print('ORIG ERROR', err.original_exception)
89+
return '{}', 500
7690

7791

7892
def create_app():
@@ -84,9 +98,9 @@ def create_app():
8498
server_addr = os.environ.get('KRAKEN_SERVER_ADDR', consts.DEFAULT_SERVER_ADDR)
8599

86100
srvcheck.check_postgresql(db_url)
87-
srvcheck.check_tcp_service('redis', redis_addr, 6379)
88-
srvcheck.check_url('clickhouse', clickhouse_url, 8123)
89-
srvcheck.check_url('planner', planner_url, 7997)
101+
srvcheck.wait_for_service('redis', redis_addr, 6379)
102+
srvcheck.wait_for_service('clickhouse', clickhouse_url, 8123)
103+
srvcheck.wait_for_service('planner', planner_url, 7997)
90104

91105
logs.setup_logging('server')
92106
log.info('Kraken Server started, version %s', version.version)
@@ -107,11 +121,17 @@ def create_app():
107121
# initialize SqlAlchemy
108122
models.db.init_app(app)
109123

110-
# setup sentry
111124
with app.app_context():
125+
# setup sentry
112126
sentry_url = models.get_setting('monitoring', 'sentry_dsn')
113127
logs.setup_sentry(sentry_url)
114128

129+
# check minio connection
130+
minio_conn = minioops.check_connection()
131+
if not minio_conn:
132+
minio_addr, _, _ = minioops.get_minio_addr()
133+
log.warning('No connection to minio at %s', minio_addr)
134+
115135
# Read the swagger.yml file to configure the endpoints
116136
connex_app.add_api("swagger.yml", resolver=MyResolver())
117137

@@ -137,7 +157,9 @@ def create_app():
137157
app.add_url_rule("/branch-badge/<branch_id>/<what>", view_func=badge.get_branch_badge, methods=['GET'])
138158

139159
app.before_request(_set_log_ctx)
140-
app.teardown_request(_clear_log_ctx)
160+
app.teardown_request(_clear_request_ctx)
161+
162+
app.register_error_handler(500, _unhandled_error_handler)
141163

142164
return connex_app
143165

server/kraken/server/srvcheck.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,28 +16,43 @@
1616
import socket
1717
from urllib.parse import urlparse
1818

19+
from . import utils
20+
1921

2022
def _is_service_open(addr, port, sock_type):
2123
with socket.socket(socket.AF_INET, sock_type) as s:
2224
s.settimeout(1)
2325
try:
24-
s.connect((addr, int(port)))
26+
s.connect((addr, port))
2527
s.shutdown(socket.SHUT_RDWR)
2628
return True
2729
except Exception:
2830
return False
2931

3032

31-
def is_addr_open(url, default_port=None):
32-
o = urlparse(url)
33-
return _is_service_open(o.hostname, o.port or default_port, socket.SOCK_STREAM)
33+
def _parse_url_or_addr(url_addr, default_port=None):
34+
o = urlparse(url_addr)
35+
if o.hostname:
36+
host = o.hostname
37+
port = o.port or default_port
38+
else:
39+
host, port = utils.split_host_port(url_addr, default_port)
40+
41+
return host, port
42+
43+
44+
def is_service_open(url_addr, default_port=None):
45+
host, port = _parse_url_or_addr(url_addr, default_port)
46+
return _is_service_open(host, port, socket.SOCK_STREAM)
3447

3548

36-
def check_tcp_service(name, addr, port):
49+
def wait_for_service(name, url_addr, default_port):
50+
host, port = _parse_url_or_addr(url_addr, default_port)
51+
3752
attempt = 1
38-
trace = "checking TCP service %s on %s:%d..." % (name, addr, port)
53+
trace = "checking TCP service %s on %s:%d..." % (name, host, port)
3954
print("%s %d." % (trace, attempt))
40-
while not _is_service_open(addr, port, socket.SOCK_STREAM):
55+
while not _is_service_open(host, port, socket.SOCK_STREAM):
4156
if attempt < 3:
4257
time.sleep(2)
4358
elif attempt < 10:
@@ -49,10 +64,5 @@ def check_tcp_service(name, addr, port):
4964
print("%s is up" % name)
5065

5166

52-
def check_url(name, url, default_port):
53-
o = urlparse(url)
54-
check_tcp_service(name, o.hostname, o.port or default_port)
55-
56-
5767
def check_postgresql(url):
58-
check_url('postgresql', url, 5432)
68+
wait_for_service('postgresql', url, 5432)

server/kraken/server/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,15 @@
2222

2323
def utcnow():
2424
return datetime.datetime.now(pytz.utc)
25+
26+
27+
def split_host_port(addr, default_port):
28+
parts = addr.split(':')
29+
host = parts[0]
30+
if len(parts) == 1:
31+
if default_port is None:
32+
raise Exception("format of address '%s' is incorrect" % addr)
33+
port = default_port
34+
else:
35+
port = int(parts[1])
36+
return host, port

0 commit comments

Comments
 (0)