routes.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. // Copyright 2019 the Kilo authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //go:build linux
  15. // +build linux
  16. package mesh
  17. import (
  18. "net"
  19. "github.com/vishvananda/netlink"
  20. "golang.org/x/sys/unix"
  21. "github.com/squat/kilo/pkg/encapsulation"
  22. "github.com/squat/kilo/pkg/iptables"
  23. )
// kiloTableIndex is the index of the routing table that receives the
// encapsulation routes generated in this file; the policy-routing rules
// generated alongside them point at the same table.
const kiloTableIndex = 1107
  25. // Routes generates a slice of routes for a given Topology.
  26. func (t *Topology) Routes(kiloIfaceName string, kiloIface, privIface, tunlIface int, local bool, enc encapsulation.Encapsulator) ([]*netlink.Route, []*netlink.Rule) {
  27. var routes []*netlink.Route
  28. var rules []*netlink.Rule
  29. if !t.leader {
  30. // Find the GW for this segment.
  31. // This will be the an IP of the leader.
  32. // In an IPIP encapsulated mesh it is the leader's private IP.
  33. var gw net.IP
  34. for _, segment := range t.segments {
  35. if segment.location == t.location {
  36. gw = enc.Gw(t.updateEndpoint(segment.endpoint, segment.key, &segment.persistentKeepalive).IP(), segment.privateIPs[segment.leader], ipFromIPNet(segment.cniCompatibilityIPs[segment.leader]), segment.cidrs[segment.leader])
  37. break
  38. }
  39. }
  40. for _, segment := range t.segments {
  41. // First, add a route to the WireGuard IP of the segment.
  42. routes = append(routes, encapsulateRoute(&netlink.Route{
  43. Dst: oneAddressCIDR(segment.wireGuardIP),
  44. Flags: int(netlink.FLAG_ONLINK),
  45. Gw: gw,
  46. LinkIndex: privIface,
  47. Protocol: unix.RTPROT_STATIC,
  48. }, enc.Strategy(), t.privateIP, tunlIface))
  49. // Add routes for the current segment if local is true.
  50. if segment.location == t.location {
  51. if local {
  52. for i := range segment.cidrs {
  53. // Don't add routes for the local node.
  54. if segment.privateIPs[i].Equal(t.privateIP.IP) {
  55. continue
  56. }
  57. nodeGw := enc.Gw(nil, segment.privateIPs[i], ipFromIPNet(segment.cniCompatibilityIPs[i]), segment.cidrs[i])
  58. routes = append(routes, encapsulateRoute(&netlink.Route{
  59. Dst: segment.cidrs[i],
  60. Flags: int(netlink.FLAG_ONLINK),
  61. Gw: nodeGw,
  62. LinkIndex: privIface,
  63. Protocol: unix.RTPROT_STATIC,
  64. }, enc.Strategy(), t.privateIP, tunlIface))
  65. // Encapsulate packets from the host's Pod subnet headed
  66. // to private IPs.
  67. if enc.Strategy() == encapsulation.Always || (enc.Strategy() == encapsulation.CrossSubnet && !t.privateIP.Contains(segment.privateIPs[i])) {
  68. routes = append(routes, &netlink.Route{
  69. Dst: oneAddressCIDR(segment.privateIPs[i]),
  70. Flags: int(netlink.FLAG_ONLINK),
  71. Gw: nodeGw,
  72. LinkIndex: tunlIface,
  73. Src: t.privateIP.IP,
  74. Protocol: unix.RTPROT_STATIC,
  75. Table: kiloTableIndex,
  76. })
  77. rules = append(rules, defaultRule(&netlink.Rule{
  78. Src: t.subnet,
  79. Dst: oneAddressCIDR(segment.privateIPs[i]),
  80. Table: kiloTableIndex,
  81. }))
  82. }
  83. }
  84. }
  85. continue
  86. }
  87. for i := range segment.cidrs {
  88. // Add routes to the Pod CIDRs of nodes in other segments.
  89. routes = append(routes, encapsulateRoute(&netlink.Route{
  90. Dst: segment.cidrs[i],
  91. Flags: int(netlink.FLAG_ONLINK),
  92. Gw: gw,
  93. LinkIndex: privIface,
  94. Protocol: unix.RTPROT_STATIC,
  95. }, enc.Strategy(), t.privateIP, tunlIface))
  96. }
  97. for i := range segment.privateIPs {
  98. // Add routes to the private IPs of nodes in other segments.
  99. routes = append(routes, encapsulateRoute(&netlink.Route{
  100. Dst: oneAddressCIDR(segment.privateIPs[i]),
  101. Flags: int(netlink.FLAG_ONLINK),
  102. Gw: gw,
  103. LinkIndex: privIface,
  104. Protocol: unix.RTPROT_STATIC,
  105. }, enc.Strategy(), t.privateIP, tunlIface))
  106. }
  107. // For segments / locations other than the location of this instance of kg,
  108. // we need to set routes for allowed location IPs over the leader in the current location.
  109. for i := range segment.allowedLocationIPs {
  110. routes = append(routes, encapsulateRoute(&netlink.Route{
  111. Dst: &segment.allowedLocationIPs[i],
  112. Flags: int(netlink.FLAG_ONLINK),
  113. Gw: gw,
  114. LinkIndex: privIface,
  115. Protocol: unix.RTPROT_STATIC,
  116. }, enc.Strategy(), t.privateIP, tunlIface))
  117. }
  118. }
  119. // Add routes for the allowed IPs of peers.
  120. for _, peer := range t.peers {
  121. for i := range peer.AllowedIPs {
  122. routes = append(routes, encapsulateRoute(&netlink.Route{
  123. Dst: &peer.AllowedIPs[i],
  124. Flags: int(netlink.FLAG_ONLINK),
  125. Gw: gw,
  126. LinkIndex: privIface,
  127. Protocol: unix.RTPROT_STATIC,
  128. }, enc.Strategy(), t.privateIP, tunlIface))
  129. }
  130. }
  131. return routes, rules
  132. }
  133. // Compute the preferred source address for routes through the WireGuard interface.
  134. // Without this, the kernel picks the WireGuard overlay IP (e.g. 100.66.0.x) as the
  135. // source, which can cause issues in environments like Azure SDN where the overlay
  136. // IP is unknown to the network fabric and reply packets cannot be routed back.
  137. var src net.IP
  138. if t.privateIP != nil {
  139. src = t.privateIP.IP
  140. }
  141. for _, segment := range t.segments {
  142. // Add routes for the current segment if local is true.
  143. if segment.location == t.location {
  144. // If the local node does not have a private IP address,
  145. // then skip adding routes, because the node is in its own location.
  146. if local && t.privateIP != nil {
  147. for i := range segment.cidrs {
  148. // Don't add routes for the local node.
  149. if segment.privateIPs[i].Equal(t.privateIP.IP) {
  150. continue
  151. }
  152. nodeGw := enc.Gw(nil, segment.privateIPs[i], ipFromIPNet(segment.cniCompatibilityIPs[i]), segment.cidrs[i])
  153. routes = append(routes, encapsulateRoute(&netlink.Route{
  154. Dst: segment.cidrs[i],
  155. Flags: int(netlink.FLAG_ONLINK),
  156. Gw: nodeGw,
  157. LinkIndex: privIface,
  158. Protocol: unix.RTPROT_STATIC,
  159. }, enc.Strategy(), t.privateIP, tunlIface))
  160. // Encapsulate packets from the host's Pod subnet headed
  161. // to private IPs.
  162. if enc.Strategy() == encapsulation.Always || (enc.Strategy() == encapsulation.CrossSubnet && !t.privateIP.Contains(segment.privateIPs[i])) {
  163. routes = append(routes, &netlink.Route{
  164. Dst: oneAddressCIDR(segment.privateIPs[i]),
  165. Flags: int(netlink.FLAG_ONLINK),
  166. Gw: nodeGw,
  167. LinkIndex: tunlIface,
  168. Src: t.privateIP.IP,
  169. Protocol: unix.RTPROT_STATIC,
  170. Table: kiloTableIndex,
  171. })
  172. rules = append(rules, defaultRule(&netlink.Rule{
  173. Src: t.subnet,
  174. Dst: oneAddressCIDR(segment.privateIPs[i]),
  175. Table: kiloTableIndex,
  176. }))
  177. // Also encapsulate packets from the Kilo interface
  178. // headed to private IPs.
  179. rules = append(rules, defaultRule(&netlink.Rule{
  180. Dst: oneAddressCIDR(segment.privateIPs[i]),
  181. Table: kiloTableIndex,
  182. IifName: kiloIfaceName,
  183. }))
  184. }
  185. }
  186. }
  187. // When not managing local routes, the leader still needs to
  188. // route return WireGuard traffic through IPIP when non-leaders
  189. // use overlay routing (e.g. Cilium) to reach the leader.
  190. // Use the overlay gateway (e.g. Cilium internal IP) so the
  191. // IPIP outer packet is routed through the overlay tunnel,
  192. // since direct IPIP may be blocked by the cloud network.
  193. if !local && t.privateIP != nil && enc.Strategy() != encapsulation.Never {
  194. for i := range segment.cidrs {
  195. if segment.privateIPs[i].Equal(t.privateIP.IP) {
  196. continue
  197. }
  198. nodeGw := enc.Gw(nil, segment.privateIPs[i], ipFromIPNet(segment.cniCompatibilityIPs[i]), segment.cidrs[i])
  199. if nodeGw != nil && !nodeGw.Equal(segment.privateIPs[i]) {
  200. routes = append(routes, &netlink.Route{
  201. Dst: oneAddressCIDR(segment.privateIPs[i]),
  202. Flags: int(netlink.FLAG_ONLINK),
  203. Gw: nodeGw,
  204. LinkIndex: tunlIface,
  205. Protocol: unix.RTPROT_STATIC,
  206. Table: kiloTableIndex,
  207. })
  208. rules = append(rules, defaultRule(&netlink.Rule{
  209. Dst: oneAddressCIDR(segment.privateIPs[i]),
  210. Table: kiloTableIndex,
  211. IifName: kiloIfaceName,
  212. }))
  213. }
  214. }
  215. }
  216. // Continuing here prevents leaders form adding routes via WireGuard to
  217. // nodes in their own location.
  218. continue
  219. }
  220. for i := range segment.cidrs {
  221. // Add routes to the Pod CIDRs of nodes in other segments.
  222. routes = append(routes, &netlink.Route{
  223. Dst: segment.cidrs[i],
  224. Flags: int(netlink.FLAG_ONLINK),
  225. Gw: segment.wireGuardIP,
  226. LinkIndex: kiloIface,
  227. Src: src,
  228. Protocol: unix.RTPROT_STATIC,
  229. })
  230. // Don't add routes through Kilo if the private IP
  231. // equals the external IP. This means that the node
  232. // is only accessible through an external IP and we
  233. // cannot encapsulate traffic to an IP through the IP.
  234. if segment.privateIPs == nil || segment.privateIPs[i].Equal(t.updateEndpoint(segment.endpoint, segment.key, &segment.persistentKeepalive).IP()) {
  235. continue
  236. }
  237. // Add routes to the private IPs of nodes in other segments.
  238. // Number of CIDRs and private IPs always match so
  239. // we can reuse the loop.
  240. routes = append(routes, &netlink.Route{
  241. Dst: oneAddressCIDR(segment.privateIPs[i]),
  242. Flags: int(netlink.FLAG_ONLINK),
  243. Gw: segment.wireGuardIP,
  244. LinkIndex: kiloIface,
  245. Src: src,
  246. Protocol: unix.RTPROT_STATIC,
  247. })
  248. }
  249. // For segments / locations other than the location of this instance of kg,
  250. // we need to set routes for allowed location IPs over the wg interface.
  251. for i := range segment.allowedLocationIPs {
  252. routes = append(routes, &netlink.Route{
  253. Dst: &segment.allowedLocationIPs[i],
  254. Flags: int(netlink.FLAG_ONLINK),
  255. Gw: segment.wireGuardIP,
  256. LinkIndex: kiloIface,
  257. Src: src,
  258. Protocol: unix.RTPROT_STATIC,
  259. })
  260. }
  261. }
  262. // Add routes for the allowed IPs of peers.
  263. for _, peer := range t.peers {
  264. for i := range peer.AllowedIPs {
  265. routes = append(routes, &netlink.Route{
  266. Dst: &peer.AllowedIPs[i],
  267. LinkIndex: kiloIface,
  268. Src: src,
  269. Protocol: unix.RTPROT_STATIC,
  270. })
  271. }
  272. }
  273. return routes, rules
  274. }
  275. // PeerRoutes generates a slice of routes and rules for a given peer in the Topology.
  276. func (t *Topology) PeerRoutes(name string, kiloIface int, additionalAllowedIPs []net.IPNet) ([]*netlink.Route, []*netlink.Rule) {
  277. var routes []*netlink.Route
  278. var rules []*netlink.Rule
  279. for _, segment := range t.segments {
  280. for i := range segment.cidrs {
  281. // Add routes to the Pod CIDRs of nodes in other segments.
  282. routes = append(routes, &netlink.Route{
  283. Dst: segment.cidrs[i],
  284. Flags: int(netlink.FLAG_ONLINK),
  285. Gw: segment.wireGuardIP,
  286. LinkIndex: kiloIface,
  287. Protocol: unix.RTPROT_STATIC,
  288. })
  289. }
  290. for i := range segment.privateIPs {
  291. // Add routes to the private IPs of nodes in other segments.
  292. routes = append(routes, &netlink.Route{
  293. Dst: oneAddressCIDR(segment.privateIPs[i]),
  294. Flags: int(netlink.FLAG_ONLINK),
  295. Gw: segment.wireGuardIP,
  296. LinkIndex: kiloIface,
  297. Protocol: unix.RTPROT_STATIC,
  298. })
  299. }
  300. // Add routes for the allowed location IPs of all segments.
  301. for i := range segment.allowedLocationIPs {
  302. routes = append(routes, &netlink.Route{
  303. Dst: &segment.allowedLocationIPs[i],
  304. Flags: int(netlink.FLAG_ONLINK),
  305. Gw: segment.wireGuardIP,
  306. LinkIndex: kiloIface,
  307. Protocol: unix.RTPROT_STATIC,
  308. })
  309. }
  310. routes = append(routes, &netlink.Route{
  311. Dst: oneAddressCIDR(segment.wireGuardIP),
  312. LinkIndex: kiloIface,
  313. Protocol: unix.RTPROT_STATIC,
  314. })
  315. }
  316. // Add routes for the allowed IPs of peers.
  317. for _, peer := range t.peers {
  318. // Don't add routes to ourselves.
  319. if peer.Name == name {
  320. continue
  321. }
  322. for i := range peer.AllowedIPs {
  323. routes = append(routes, &netlink.Route{
  324. Dst: &peer.AllowedIPs[i],
  325. LinkIndex: kiloIface,
  326. Protocol: unix.RTPROT_STATIC,
  327. })
  328. }
  329. }
  330. for i := range additionalAllowedIPs {
  331. routes = append(routes, &netlink.Route{
  332. Dst: &additionalAllowedIPs[i],
  333. Flags: int(netlink.FLAG_ONLINK),
  334. Gw: t.segments[0].wireGuardIP,
  335. LinkIndex: kiloIface,
  336. Protocol: unix.RTPROT_STATIC,
  337. })
  338. }
  339. return routes, rules
  340. }
  341. func encapsulateRoute(route *netlink.Route, encapsulate encapsulation.Strategy, subnet *net.IPNet, tunlIface int) *netlink.Route {
  342. if encapsulate == encapsulation.Always || (encapsulate == encapsulation.CrossSubnet && subnet != nil && !subnet.Contains(route.Gw)) {
  343. route.LinkIndex = tunlIface
  344. if subnet != nil && route.Src == nil {
  345. route.Src = subnet.IP
  346. }
  347. }
  348. return route
  349. }
  350. // Rules returns the iptables rules required by the local node.
  351. func (t *Topology) Rules(cni, iptablesForwardRule bool) iptables.RuleSet {
  352. rules := iptables.RuleSet{}
  353. rules.AddToAppend(iptables.NewIPv4Chain("nat", "KILO-NAT"))
  354. rules.AddToAppend(iptables.NewIPv6Chain("nat", "KILO-NAT"))
  355. if cni {
  356. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(t.subnet.IP), "nat", "POSTROUTING", "-s", t.subnet.String(), "-m", "comment", "--comment", "Kilo: jump to KILO-NAT chain", "-j", "KILO-NAT"))
  357. // Some linux distros or docker will set forward DROP in the filter table.
  358. // To still be able to have pod to pod communication we need to ALLOW packets from and to pod CIDRs within a location.
  359. // Leader nodes will forward packets from all nodes within a location because they act as a gateway for them.
  360. // Non leader nodes only need to allow packages from and to their own pod CIDR.
  361. if iptablesForwardRule && t.leader {
  362. for _, s := range t.segments {
  363. if s.location == t.location {
  364. // Make sure packets to and from pod cidrs are not dropped in the forward chain.
  365. for _, c := range s.cidrs {
  366. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets from the pod subnet", "-s", c.String(), "-j", "ACCEPT"))
  367. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets to the pod subnet", "-d", c.String(), "-j", "ACCEPT"))
  368. }
  369. // Make sure packets to and from allowed location IPs are not dropped in the forward chain.
  370. for _, c := range s.allowedLocationIPs {
  371. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets from allowed location IPs", "-s", c.String(), "-j", "ACCEPT"))
  372. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets to allowed location IPs", "-d", c.String(), "-j", "ACCEPT"))
  373. }
  374. // Make sure packets to and from private IPs are not dropped in the forward chain.
  375. for _, c := range s.privateIPs {
  376. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets from private IPs", "-s", oneAddressCIDR(c).String(), "-j", "ACCEPT"))
  377. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(c), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets to private IPs", "-d", oneAddressCIDR(c).String(), "-j", "ACCEPT"))
  378. }
  379. }
  380. }
  381. } else if iptablesForwardRule {
  382. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(t.subnet.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets from the node's pod subnet", "-s", t.subnet.String(), "-j", "ACCEPT"))
  383. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(t.subnet.IP), "filter", "FORWARD", "-m", "comment", "--comment", "Kilo: forward packets to the node's pod subnet", "-d", t.subnet.String(), "-j", "ACCEPT"))
  384. }
  385. }
  386. for _, s := range t.segments {
  387. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(s.wireGuardIP), "nat", "KILO-NAT", "-d", oneAddressCIDR(s.wireGuardIP).String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for WireGuared IPs", "-j", "RETURN"))
  388. for _, aip := range s.allowedIPs {
  389. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(aip.IP), "nat", "KILO-NAT", "-d", aip.String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for known IPs", "-j", "RETURN"))
  390. }
  391. // Make sure packets to allowed location IPs go through the KILO-NAT chain, so they can be MASQUERADEd,
  392. // Otherwise packets to these destinations will reach the destination, but never find their way back.
  393. // We only want to NAT in locations of the corresponding allowed location IPs.
  394. if t.location == s.location {
  395. for _, alip := range s.allowedLocationIPs {
  396. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(alip.IP), "nat", "POSTROUTING", "-d", alip.String(), "-m", "comment", "--comment", "Kilo: jump to NAT chain", "-j", "KILO-NAT"))
  397. }
  398. }
  399. }
  400. for _, p := range t.peers {
  401. for _, aip := range p.AllowedIPs {
  402. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(aip.IP), "nat", "POSTROUTING", "-s", aip.String(), "-m", "comment", "--comment", "Kilo: jump to NAT chain", "-j", "KILO-NAT"))
  403. rules.AddToPrepend(iptables.NewRule(iptables.GetProtocol(aip.IP), "nat", "KILO-NAT", "-d", aip.String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for peers", "-j", "RETURN"))
  404. }
  405. }
  406. for _, s := range t.serviceCIDRs {
  407. rules.AddToAppend(iptables.NewRule(iptables.GetProtocol(s.IP), "nat", "KILO-NAT", "-d", s.String(), "-m", "comment", "--comment", "Kilo: do not NAT packets destined for service CIDRs", "-j", "RETURN"))
  408. }
  409. rules.AddToAppend(iptables.NewIPv4Rule("nat", "KILO-NAT", "-m", "comment", "--comment", "Kilo: NAT remaining packets", "-j", "MASQUERADE"))
  410. rules.AddToAppend(iptables.NewIPv6Rule("nat", "KILO-NAT", "-m", "comment", "--comment", "Kilo: NAT remaining packets", "-j", "MASQUERADE"))
  411. return rules
  412. }
  413. func defaultRule(rule *netlink.Rule) *netlink.Rule {
  414. base := netlink.NewRule()
  415. base.Src = rule.Src
  416. base.Dst = rule.Dst
  417. base.IifName = rule.IifName
  418. base.Table = rule.Table
  419. return base
  420. }