skynet 中监测服务是否挂掉

写一个监测服务来监测所有其他服务:一旦某个服务宕掉(崩溃或退出),就立即发邮件通知。

  1. main 里第一个先启动 moniter 服务(unique service)
    skynet.uniqueservice ("moniter")

  2. 其他服务启动后,向 moniter 注册一下,加入moniter的监测列表中

  3. moniter 中定时轮询:用 xpcall 安全地调用每个被监测服务的心跳方法(CMD.heart_beat())。如果服务已经宕掉,调用会抛出错误,xpcall 捕获后返回 false 和错误信息,此时发邮件通知即可。


监测服务 moniter

moniter.lua

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
require "functions"
local skynet = require "skynet"
local syslog = require "syslog"

-- Message handler handed to xpcall so that caught errors carry a stack trace.
local traceback = debug.traceback
-- Delay passed to skynet.timeout between two heartbeat sweeps.
-- NOTE(review): skynet timers presumably tick in 1/100 s, making this 5 seconds — confirm.
local IntervalTime = 5 * 100
-- Watch list: service address -> name the service registered with.
local serTab = {}

-- Command handlers looked up by name in the "lua" message dispatcher.
local CMD = {}

--- Add the calling service to the watch list.
-- @param source   address of the registering service (filled in by the dispatcher)
-- @param _serName name reported when the service is found to be down
CMD.register = function(source, _serName)
    serTab[source] = _serName
end

--- Report that a monitored service is down.
-- At the moment this only writes an error entry through syslog; the e-mail
-- notification is meant to be hooked in here.
-- @param _serName name the service registered with
local function serviceDump(_serName)
    local fmt = "--- Error: service 【%s】 dump!"
    syslog.errf(fmt, _serName)
end

--- Send one blocking "heart_beat" request to a monitored service.
-- Returns nothing; any error raised by skynet.call propagates to the caller,
-- which is expected to run this under xpcall.
-- @param _addr address of the service to probe
local function callService(_addr)
    local proto, command = "lua", "heart_beat"
    skynet.call(_addr, proto, command)
end

--- Probe every registered service once, report the ones that fail, then
-- re-arm the timer for the next sweep.
--
-- The watch list is copied before probing: skynet.call suspends this
-- coroutine, and a "register" message handled in the meantime may add a new
-- key to serTab. Adding keys to a table while it is being traversed with
-- pairs/next is undefined behaviour in Lua, so the sweep walks a private
-- snapshot instead. Services that register during a sweep are picked up by
-- the next one.
--
-- NOTE(review): a service that is alive but never answers would keep
-- skynet.call (and therefore this sweep and the re-arm below) waiting —
-- confirm whether a per-call timeout is needed.
local function heartBeatScheduler()
    -- Snapshot of (address, name) pairs taken before any yielding call.
    local pending = {}
    for addr, name in pairs(serTab) do
        pending[#pending + 1] = { addr = addr, name = name }
    end

    for i = 1, #pending do
        local entry = pending[i]
        local ok = xpcall(callService, traceback, entry.addr)
        if not ok then
            serviceDump(entry.name)
            -- Stop watching it so the failure is reported only once.
            serTab[entry.addr] = nil
        end
    end

    skynet.timeout(IntervalTime, heartBeatScheduler)
end

-- Service entry point: arm the first heartbeat sweep and install the
-- dispatcher that routes "lua" messages to the handlers in CMD.
skynet.start(function()
    -- Look up the handler for `command` and run it under xpcall so that a
    -- failing handler cannot take the monitor down. Every path sends a reply:
    -- the handler's first return value on success, an empty reply otherwise.
    local function onLuaMessage(_, source, command, ...)
        local handler = CMD[command]
        if handler then
            local ok, ret = xpcall(handler, traceback, source, ...)
            if ok then
                skynet.retpack(ret)
                return
            end
            syslog.warningf("handle message(%s) failed : %s", command, ret)
        else
            syslog.warningf("unhandled message(%s)", command)
        end
        return skynet.ret()
    end

    skynet.timeout(IntervalTime, heartBeatScheduler)
    skynet.dispatch("lua", onLuaMessage)
end)


被监测的服务,以 friendserver 为例

friendserver.lua

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
--- Handle the "open" command: log the start-up and register this service
-- with the monitor under the name "friendserver".
-- @param source address of the caller (not used here)
-- @param conf   configuration argument (not used here)
function CMD.open(source, conf)
    syslog.debugf("--- friend server open")
    -- Blocking call: returns once the monitor has replied to "register".
    skynet.call(skynet.uniqueservice("moniter"), "lua", "register", "friendserver")
end

--- Heartbeat probe called by the monitor; intentionally does nothing.
-- Being able to answer at all is what tells the monitor this service is alive.
function CMD.heart_beat()
    -- print("--- heart_beat friendserver")
    return
end

-- Message handler handed to xpcall so that caught errors carry a stack trace.
local traceback = debug.traceback

-- Service entry point: install the dispatcher that routes "lua" messages to
-- the handlers in CMD.
skynet.start(function()
    -- skynet.timeout (800, function() skynet.exit() end) -- for test moniter

    -- Run the handler for `command` under xpcall. Every path sends a reply:
    -- the handler's first return value on success, an empty reply when the
    -- command is unknown or the handler raised an error.
    local function onLuaMessage(_, source, command, ...)
        local handler = CMD[command]
        if handler then
            local ok, ret = xpcall(handler, traceback, source, ...)
            if ok then
                skynet.retpack(ret)
                return
            end
            syslog.warningf("handle message(%s) failed : %s", command, ret)
        else
            syslog.warningf("unhandled message(%s)", command)
        end
        return skynet.ret()
    end

    skynet.dispatch("lua", onLuaMessage)
end)

main.lua 中启动

1
2
-- Obtain the address of the unique "friendserver" service.
local friendserver = skynet.uniqueservice ("friendserver")
-- Blocking "open" command; friendserver's CMD.open registers it with the moniter service.
skynet.call (friendserver, "lua", "open")

a