diff --git a/discovery/dns/dns.go b/discovery/dns/dns.go index 61ab2dd3a3..40077246cc 100644 --- a/discovery/dns/dns.go +++ b/discovery/dns/dns.go @@ -124,7 +124,7 @@ func (d *Discovery) refreshAll(ctx context.Context, ch chan<- []*config.TargetGr } func (d *Discovery) refresh(ctx context.Context, name string, ch chan<- []*config.TargetGroup) error { - response, err := lookupAll(name, d.qtype, d.logger) + response, err := lookupWithSearchPath(name, d.qtype, d.logger) dnsSDLookupsCount.Inc() if err != nil { dnsSDLookupFailuresCount.Inc() @@ -151,7 +151,6 @@ func (d *Discovery) refresh(ctx context.Context, name string, ch chan<- []*confi default: d.logger.Warnf("%q is not a valid SRV record", record) continue - } tg.Targets = append(tg.Targets, model.LabelSet{ model.AddressLabel: target, @@ -169,39 +168,113 @@ func (d *Discovery) refresh(ctx context.Context, name string, ch chan<- []*confi return nil } -func lookupAll(name string, qtype uint16, logger log.Logger) (*dns.Msg, error) { +// lookupWithSearchPath tries to get an answer for various permutations of +// the given name, appending the system-configured search path as necessary. +// +// There are three possible outcomes: +// +// 1. One of the permutations of the given name is recognised as +// "valid" by the DNS, in which case we consider ourselves "done" +// and that answer is returned. Note that, due to the way the DNS +// handles "name has resource records, but none of the specified type", +// the answer received may have an empty set of results. +// +// 2. All of the permutations of the given name are responded to by one of +// the servers in the "nameservers" list with the answer "that name does +// not exist" (NXDOMAIN). In that case, it can be considered +// pseudo-authoritative that there are no records for that name. +// +// 3. One or more of the names was responded to by all servers with some +// sort of error indication. In that case, we can't know if, in fact, +// there are records for the name or not, so whatever state the +// configuration is in, we should keep it that way until we know for +// sure (by, presumably, all the names getting answers in the future). +// +// Outcomes 1 and 2 are indicated by a valid response message (possibly an +// empty one) and no error. Outcome 3 is indicated by an error return. The +// error will be generic-looking, because trying to return all the errors +// returned by the combination of all name permutations and servers is a +// nightmare. +func lookupWithSearchPath(name string, qtype uint16, logger log.Logger) (*dns.Msg, error) { conf, err := dns.ClientConfigFromFile(resolvConf) if err != nil { return nil, fmt.Errorf("could not load resolv.conf: %s", err) } + allResponsesValid := true + + for _, lname := range conf.NameList(name) { + response, err := lookupFromAnyServer(lname, qtype, conf, logger) + + if err != nil { + // We can't go home yet, because a later name + // may give us a valid, successful answer. However + // we can no longer say "this name definitely doesn't + // exist", because we did not get that answer for + // at least one name. + allResponsesValid = false + } else if response.Rcode == dns.RcodeSuccess { + // Outcome 1: GOLD! + return response, nil + } + } + + if allResponsesValid { + // Outcome 2: everyone says NXDOMAIN, that's good enough for me + return &dns.Msg{}, nil + } + // Outcome 3: boned. + return nil, fmt.Errorf("could not resolve %q: all servers responded with errors to at least one search domain", name) +} + +// lookupFromAnyServer uses all configured servers to try and resolve a specific +// name. If a viable answer is received from a server, then it is +// immediately returned, otherwise the other servers in the config are +// tried, and if none of them return a viable answer, an error is returned. +// +// A "viable answer" is one which indicates either: +// +// 1. "yes, I know that name, and here are its records of the requested type" +// (RCODE==SUCCESS, ANCOUNT > 0); +// 2. "yes, I know that name, but it has no records of the requested type" +// (RCODE==SUCCESS, ANCOUNT==0); or +// 3. "I know that name doesn't exist" (RCODE==NXDOMAIN). +// +// A non-viable answer is "anything else", which encompasses both various +// system-level problems (like network timeouts) and also +// valid-but-unexpected DNS responses (SERVFAIL, REFUSED, etc). +func lookupFromAnyServer(name string, qtype uint16, conf *dns.ClientConfig, logger log.Logger) (*dns.Msg, error) { client := &dns.Client{} - response := &dns.Msg{} for _, server := range conf.Servers { servAddr := net.JoinHostPort(server, conf.Port) - for _, lname := range conf.NameList(name) { - response, err = lookup(lname, qtype, client, servAddr, false) - if err != nil { - logger. - With("server", server). - With("name", name). - With("reason", err). - Warn("DNS resolution failed.") - continue - } - if len(response.Answer) > 0 { - return response, nil - } + msg, err := askServerForName(name, qtype, client, servAddr, false) + if err != nil { + logger. + With("server", server). + With("name", name). + With("reason", err). + Warn("DNS resolution failed.") + continue + } + + if msg.Rcode == dns.RcodeSuccess || msg.Rcode == dns.RcodeNameError { + // We have our answer. Time to go home. + return msg, nil } } - return response, fmt.Errorf("could not resolve %s: no server responded", name) + + return nil, fmt.Errorf("could not resolve %s: no servers returned a viable answer", name) } -func lookup(lname string, queryType uint16, client *dns.Client, servAddr string, edns bool) (*dns.Msg, error) { +// askServerForName makes a request to a specific DNS server for a specific +// name (and qtype). Retries in the event of response truncation, but +// otherwise just sends back whatever the server gave, whether that be a +// valid-looking response, or an error. +func askServerForName(name string, queryType uint16, client *dns.Client, servAddr string, edns bool) (*dns.Msg, error) { msg := &dns.Msg{} - msg.SetQuestion(dns.Fqdn(lname), queryType) + msg.SetQuestion(dns.Fqdn(name), queryType) if edns { msg.SetEdns0(dns.DefaultMsgSize, false) } @@ -214,7 +287,7 @@ func lookup(lname string, queryType uint16, client *dns.Client, servAddr string, if edns { // Truncated even though EDNS is used client.Net = "tcp" } - return lookup(lname, queryType, client, servAddr, !edns) + return askServerForName(name, queryType, client, servAddr, !edns) } if err != nil { return nil, err