Fork me on GitHub

Golang服务器热重启、热升级、热更新(safe and graceful hot-restart/reload http server)详解

原理

热重启的原理比较简单,但涉及系统调用、父子进程之间文件句柄的传递等较多细节。
处理过程分为以下几个步骤:

监听信号(USR2..)

收到信号时fork子进程(使用相同的启动命令),将服务监听的socket文件描述符传递给子进程
子进程监听父进程的socket,这个时候父进程和子进程都可以接收请求
子进程启动成功之后,父进程停止接收新的连接,等待旧连接处理完成(或超时)
父进程退出,重启完成

细节

父进程将socket文件描述符传递给子进程可以通过命令行,或者环境变量等
子进程启动时使用和父进程一样的命令行;对于golang来说,需要先用新编译的可执行文件覆盖旧文件,这样子进程启动后运行的就是新版本
server.Shutdown()优雅关闭方法是go>=1.8的新特性
server.Serve(l)方法在Shutdown被调用时会立即返回,而Shutdown方法会一直阻塞到所有连接处理完毕或context超时为止,所以Shutdown要写在主goroutine中(Serve放到单独的goroutine里),否则主goroutine会提前退出

代码

  1. package main
  2. import (
  3. "context"
  4. "errors"
  5. "flag"
  6. "log"
  7. "net"
  8. "net/http"
  9. "os"
  10. "os/exec"
  11. "os/signal"
  12. "syscall"
  13. "time"
  14. )
  // server is the HTTP server instance; main creates it and signalHandler
  // shuts it down.
  var server *http.Server

  // listener is the socket the server accepts connections on. reload hands a
  // duplicate of it to the child process.
  var listener net.Listener

  // graceful is set by the parent when it re-executes the binary: the child
  // must then adopt the listening socket found on file descriptor 3 instead of
  // opening a new one.
  var graceful = flag.Bool("graceful", false, "listen on fd open 3 (internal use only)")
  // handler simulates a slow endpoint: it holds the request for 20 seconds and
  // only then writes the response body, which makes it easy to observe whether
  // in-flight requests survive a restart.
  func handler(w http.ResponseWriter, r *http.Request) {
  	const delay = 20 * time.Second
  	time.Sleep(delay)

  	body := []byte("hello world233333!!!!")
  	w.Write(body)
  }
  24. func main() {
  25. flag.Parse()
  26. http.HandleFunc("/hello", handler)
  27. server = &http.Server{Addr: ":9999"}
  28. var err error
  29. if *graceful {
  30. log.Print("main: Listening to existing file descriptor 3.")
  31. // cmd.ExtraFiles: If non-nil, entry i becomes file descriptor 3+i.
  32. // when we put socket FD at the first entry, it will always be 3(0+3)
  33.      //为什么是3呢,而不是1 0 或者其他数字?这是因为父进程里给了个fd给子进程了 而子进程里0,1,2是预留给 标准输入、输出和错误的,所以父进程给的第一个fd在子进程里顺序排就是从3开始了;如果fork的时候cmd.ExtraFiles给了两个文件句柄,那么子进程里还可以用4开始,就看你开了几个子进程自增就行。因为我这里就开一个子进程所以把3写死了。l, err = net.FileListener(f)这一步只是把 fd描述符包装进TCPListener这个结构体。
  34. f := os.NewFile(3, "")
  35.      //先复制fd到新的fd, 然后设置子进程exec时自动关闭父进程的fd,即“F_DUPFD_CLOEXEC”
  36. listener, err = net.FileListener(f)
  37. } else {
  38. log.Print("main: Listening on a new file descriptor.")
  39. listener, err = net.Listen("tcp", server.Addr)
  40. }
  41. if err != nil {
  42. log.Fatalf("listener error: %v", err)
  43. }
  44. go func() {
  45. // server.Shutdown() stops Serve() immediately, thus server.Serve() should not be in main goroutine
  46. err = server.Serve(listener)
  47. log.Printf("server.Serve err: %v\n", err)
  48. }()
  49. signalHandler()
  50. log.Printf("signal end")
  51. }
  52. func reload() error {
  53. tl, ok := listener.(*net.TCPListener)
  54. if !ok {
  55. return errors.New("listener is not tcp listener")
  56. }
  57. f, err := tl.File()
  58. if err != nil {
  59. return err
  60. }
  61. args := []string{"-graceful"}
  62. cmd := exec.Command(os.Args[0], args...)
  63. cmd.Stdout = os.Stdout
  64. cmd.Stderr = os.Stderr
  65. // put socket FD at the first entry
  66. cmd.ExtraFiles = []*os.File{f}
  67. return cmd.Start()
  68. }
  69. func signalHandler() {
  70. ch := make(chan os.Signal, 1)
  71. signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR2)
  72. for {
  73. sig := <-ch
  74. log.Printf("signal: %v", sig)
  75. // timeout context for shutdown
  76. ctx, _ := context.WithTimeout(context.Background(), 20*time.Second)
  77. switch sig {
  78. case syscall.SIGINT, syscall.SIGTERM:
  79. // stop
  80. log.Printf("stop")
  81. signal.Stop(ch)
  82. server.Shutdown(ctx)
  83. log.Printf("graceful shutdown")
  84. return
  85. case syscall.SIGUSR2:
  86. // reload
  87. log.Printf("reload")
  88. err := reload()
  89. if err != nil {
  90. log.Fatalf("graceful restart error: %v", err)
  91. }
  92. server.Shutdown(ctx)
  93. log.Printf("graceful reload")
  94. return
  95. }
  96. }
  97. }

我的实现

  1. package main
  2. import (
  3. "net"
  4. "net/http"
  5. "time"
  6. "log"
  7. "syscall"
  8. "os"
  9. "os/signal"
  10. "context"
  11. "fmt"
  12. "os/exec"
  13. "flag"
  14. )
  // listener is the socket the HTTP server accepts connections on.
  var listener net.Listener

  // err holds the result of acquiring the listener in main.
  var err error

  // server is a package-level HTTP server value.
  var server http.Server

  // graceful is passed as -g by the parent when it re-executes the binary; the
  // child then adopts the listening socket found on file descriptor 3.
  var graceful = flag.Bool("g", false, "listen on fd open 3 (internal use only)")
  // MyHandler is a stateless http.Handler used to demonstrate that requests
  // in flight during a restart are still answered.
  type MyHandler struct{}

  // ServeHTTP prints a line when the request arrives, waits 10 seconds to
  // simulate slow work, writes a fixed body, and prints a second line when it
  // is done. Both lines include the process id so that output from the old and
  // the new process can be told apart.
  func (*MyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  	arrived := time.Now()
  	target := r.URL.Path + "?" + r.URL.RawQuery
  	stamped := time.Now()
  	fmt.Println("request start at ", arrived, target, "request done at ", stamped, " pid:", os.Getpid())

  	const delay = 10 * time.Second
  	time.Sleep(delay)

  	payload := []byte("this is test response")
  	w.Write(payload)

  	finished := time.Now()
  	fmt.Println("request done at ", finished, " pid:", os.Getpid())
  }
  29. func main() {
  30. flag.Parse()
  31. fmt.Println("start-up at " , time.Now(), *graceful)
  32. if *graceful {
  33. f := os.NewFile(3, "")
  34. listener, err = net.FileListener(f)
  35. fmt.Printf( "graceful-reborn %v %v %#v \n", f.Fd(), f.Name(), listener)
  36. }else{
  37. listener, err = net.Listen("tcp", ":1111")
  38. tcp,_ := listener.(*net.TCPListener)
  39. fd,_ := tcp.File()
  40. fmt.Printf( "first-boot %v %v %#v \n ", fd.Fd(),fd.Name(), listener)
  41. }
  42. server := http.Server{
  43. Handler: &MyHandler{},
  44. ReadTimeout: 6 * time.Second,
  45. }
  46. log.Printf("Actual pid is %d\n", syscall.Getpid())
  47. if err != nil {
  48. println(err)
  49. return
  50. }
  51. log.Printf(" listener: %v\n", listener)
  52. go func(){//不要阻塞主进程
  53. err := server.Serve(listener)
  54. if err != nil {
  55. log.Println(err)
  56. }
  57. }()
  58. //signals
  59. func(){
  60. ch := make(chan os.Signal, 1)
  61. signal.Notify(ch, syscall.SIGHUP, syscall.SIGTERM)
  62. for{//阻塞主进程, 不停的监听系统信号
  63. sig := <- ch
  64. log.Printf("signal: %v", sig)
  65. ctx, _ := context.WithTimeout(context.Background(), 20*time.Second)
  66. switch sig {
  67. case syscall.SIGTERM, syscall.SIGHUP:
  68. println("signal cause reloading")
  69. signal.Stop(ch)
  70. {//fork new child process
  71. tl, ok := listener.(*net.TCPListener)
  72. if !ok {
  73. fmt.Println("listener is not tcp listener")
  74. return
  75. }
  76. currentFD, err := tl.File()
  77. if err != nil {
  78. fmt.Println("acquiring listener file failed")
  79. return
  80. }
  81. cmd := exec.Command(os.Args[0], "-g")
  82. cmd.ExtraFiles, cmd.Stdout,cmd.Stderr = []*os.File{currentFD} ,os.Stdout, os.Stderr
  83. err = cmd.Start()
  84. if err != nil {
  85. fmt.Println("cmd.Start fail: ", err)
  86. return
  87. }
  88. fmt.Println("forked new pid : ",cmd.Process.Pid)
  89. }
  90. server.Shutdown(ctx)
  91. fmt.Println("graceful shutdown at ", time.Now())
  92. }
  93. }
  94. }()
  95. }
  1. qiangjian@sun-pro:/data1/works/IdeaProjects/go_core$ go run src/wright/hotrestart/booter.go
  2. start-up at 2018-10-12 15:29:34.586269 +0800 CST m=+0.004439497 false
  3. first-boot 5 tcp:[::]:1111-> &net.TCPListener{fd:(*net.netFD)(0xc00010e000)}
  4. 2018/10/12 15:29:34 Actual pid is 10771
  5. 2018/10/12 15:29:34 listener: &{0xc00010e000}
  6. request start at 2018-10-12 15:29:40.287928 +0800 CST m=+5.705965906 /aa/bb?c=d request done at 2018-10-12 15:29:40.287929 +0800 CST m=+5.705966554 pid: 10771
  7. 2018/10/12 15:29:49 signal: terminated
  8. signal cause reloading
  9. forked new pid : 10775
  10. start-up at 2018-10-12 15:29:49.689064 +0800 CST m=+0.001613279 true
  11. graceful-reborn 3 &net.TCPListener{fd:(*net.netFD)(0xc0000ec000)}
  12. 2018/10/12 15:29:49 Actual pid is 10775
  13. 2018/10/12 15:29:49 listener: &{0xc0000ec000}
  14. request done at 2018-10-12 15:29:50.288525 +0800 CST m=+15.706330718 pid: 10771
  15. 2018/10/12 15:29:50 http: Server closed
  16. request start at 2018-10-12 15:29:50.290622 +0800 CST m=+15.708426906 /aa/bb?c=d request done at 2018-10-12 15:29:50.290623 +0800 CST m=+15.708428113 pid: 10771
  17. request start at 2018-10-12 15:29:50.290713 +0800 CST m=+0.603248262 /aa/bb?c=d request done at 2018-10-12 15:29:50.290714 +0800 CST m=+0.603249293 pid: 10775
  18. request done at 2018-10-12 15:30:00.293988 +0800 CST m=+10.606290169 pid: 10775
  19. request done at 2018-10-12 15:30:00.294043 +0800 CST m=+25.711615717 pid: 10771
  20. request start at 2018-10-12 15:30:00.295554 +0800 CST m=+10.607856283 /aa/bb?c=d request done at 2018-10-12 15:30:00.295555 +0800 CST m=+10.607857307 pid: 10775
  21. request start at 2018-10-12 15:30:00.29558 +0800 CST m=+10.607881997 /aa/bb?c=d request done at 2018-10-12 15:30:00.295581 +0800 CST m=+10.607883004 pid: 10775
  22. graceful shutdown at 2018-10-12 15:30:00.79544 +0800 CST m=+26.213000502
  1. ab -v -k -c2 -n100 '127.0.0.1:1111/aa/bb?c=d'
  2. This is ApacheBench, Version 2.3 <$Revision: 1826891 $>
  3. Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
  4. Licensed to The Apache Software Foundation, http://www.apache.org/
  5. Benchmarking 127.0.0.1 (be patient)...^C
  6. Server Software:
  7. Server Hostname: 127.0.0.1
  8. Server Port: 1111
  9. Document Path: /aa/bb?c=d
  10. Document Length: 21 bytes
  11. Concurrency Level: 2
  12. Time taken for tests: 48.292 seconds
  13. Complete requests: 7
  14. Failed requests: 0
  15. Total transferred: 966 bytes
  16. HTML transferred: 147 bytes
  17. Requests per second: 0.14 [#/sec] (mean)
  18. Time per request: 13797.702 [ms] (mean)
  19. Time per request: 6898.851 [ms] (mean, across all concurrent requests)
  20. Transfer rate: 0.02 [Kbytes/sec] received
  1. kill 进程ID #发送TERM信号
  1. //还有一种方式去fork,和上面本质一样:
  2. execSpec := &syscall.ProcAttr{
  3. Env: os.Environ(),
  4. Files: []uintptr{os.Stdin.Fd(), os.Stdout.Fd(), os.Stderr.Fd(), lFd},
  5. }
  6. pid, err := syscall.ForkExec(os.Args[0], os.Args, execSpec)

可以看出:ab 测试结果中 Failed requests 为 0,且控制台显示旧请求处理完之后才 Shutdown。也就是说,在 kill 触发 Reload 之后,无论是老进程中尚未完成的旧请求,还是 fork 出子进程之后的新请求,全都处理成功,没有失败。这就是我们说的热重启!

Systemd & Supervisor

父进程退出之后,子进程会挂到1号进程上面。这种情况下使用systemd和supervisord等管理程序会显示进程处于failed的状态。解决这个问题有两个方法:

  1. 使用pidfile,每次进程重启更新一下pidfile,让进程管理者通过这个文件感知到main pid的变更。
  2. 更通用的做法:起一个master来管理服务进程,每次热重启master拉起一个新的进程,把旧的kill掉。这时master的pid没有变化,对于进程管理者来说进程处于正常的状态。

FD复制时细节

请看:

  1. https://blog.csdn.net/ChrisNiu1984/article/details/7050663

  2. http://man7.org/linux/man-pages/man2/fcntl.2.html#F_DUPFD_CLOEXEC

原文链接:https://www.cnblogs.com/sunsky303/p/9778466.html

2020-06-19 16:44:46  LeeChan 阅读(18) 评论(0) 标签:热升级,热更新 分类:技术编程