Rename ranking evaluation quality_level to metric_score (#32168)
The notion of "quality" is an overloaded term in the search ranking evaluation
context. It is usually used to describe how "good" or "bad" a search result is
with respect to the user's information need. We currently report the result of the
ranking evaluation as `quality_level`, which is a bit misleading.
This change renames the response parameter to `metric_score`, which fits better.
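For callers of the Java high-level REST client, the rename only affects the response accessors. A minimal sketch, assuming a RankEvalResponse named `response` has already been obtained from a rank_eval call (the surrounding request setup is omitted and hypothetical):

    // After this commit: read the overall score and the per-query scores.
    double overallScore = response.getMetricScore();        // previously getEvaluationResult()
    for (EvalQueryQuality queryQuality : response.getPartialResults().values()) {
        // previously getQualityLevel()
        System.out.println(queryQuality.getId() + ": " + queryQuality.metricScore());
    }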
Christoph Büscher committed Jul 23, 2018
1 parent 82c9bc0 commit 2632d1d
Showing 20 changed files with 114 additions and 117 deletions.
@@ -81,7 +81,7 @@ public void testRankEvalRequest() throws IOException {
highLevelClient()::rankEval, highLevelClient()::rankEvalAsync);
// the expected Prec@ for the first query is 5/7 and the expected Prec@ for the second is 1/7, divided by 2 to get the average
double expectedPrecision = (1.0 / 7.0 + 5.0 / 7.0) / 2.0;
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
Map<String, EvalQueryQuality> partialResults = response.getPartialResults();
assertEquals(2, partialResults.size());
EvalQueryQuality amsterdamQueryQuality = partialResults.get("amsterdam_query");
@@ -1136,14 +1136,14 @@ public void testRankEval() throws Exception {
// end::rank-eval-execute

// tag::rank-eval-response
double evaluationResult = response.getEvaluationResult(); // <1>
double evaluationResult = response.getMetricScore(); // <1>
assertEquals(1.0 / 3.0, evaluationResult, 0.0);
Map<String, EvalQueryQuality> partialResults =
response.getPartialResults();
EvalQueryQuality evalQuality =
partialResults.get("kimchy_query"); // <2>
assertEquals("kimchy_query", evalQuality.getId());
double qualityLevel = evalQuality.getQualityLevel(); // <3>
double qualityLevel = evalQuality.metricScore(); // <3>
assertEquals(1.0 / 3.0, qualityLevel, 0.0);
List<RatedSearchHit> hitsAndRatings = evalQuality.getHitsAndRatings();
RatedSearchHit ratedSearchHit = hitsAndRatings.get(0);
6 changes: 3 additions & 3 deletions docs/reference/search/rank-eval.asciidoc
@@ -274,10 +274,10 @@ that shows potential errors of individual queries. The response has the followin
--------------------------------
{
"rank_eval": {
"quality_level": 0.4, <1>
"metric_score": 0.4, <1>
"details": {
"my_query_id1": { <2>
"quality_level": 0.6, <3>
"metric_score": 0.6, <3>
"unrated_docs": [ <4>
{
"_index": "my_index",
@@ -312,7 +312,7 @@ that shows potential errors of individual queries. The response has the followin

<1> the overall evaluation quality calculated by the defined metric
<2> the `details` section contains one entry for every query in the original `requests` section, keyed by the search request id
<3> the `quality_level` in the `details` section shows the contribution of this query to the global quality score
<3> the `metric_score` in the `details` section shows the contribution of this query to the global quality metric score
<4> the `unrated_docs` section contains an `_index` and `_id` entry for each document in the search result for this
query that didn't have a ratings value. This can be used to ask the user to supply ratings for these documents
<5> the `hits` section shows a grouping of the search results with their supplied rating
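As a hedged illustration of how these response parts map onto the Java API touched by this commit, the sketch below walks the `details` entries and collects the unrated documents for each query; `response` is assumed to be an already obtained RankEvalResponse, and everything else uses accessors shown in this diff:

    for (EvalQueryQuality queryQuality : response.getPartialResults().values()) {      // <2>
        double perQueryScore = queryQuality.metricScore();                             // <3>
        // Documents that appeared in the hits but carry no rating; these could be
        // handed back to the user to supply ratings.                                  // <4>
        List<DocumentKey> unratedDocs =
                EvaluationMetric.filterUnratedDocuments(queryQuality.getHitsAndRatings()); // <5>
    }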
@@ -126,8 +126,6 @@ public Optional<Integer> forcedSearchSize() {
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
List<Integer> ratingsInSearchHits = new ArrayList<>(ratedHits.size());
int unratedResults = 0;
@@ -144,6 +142,8 @@ public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
double idcg = 0;

if (normalize) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
Collections.sort(allRatings, Comparator.nullsLast(Collections.reverseOrder()));
idcg = computeDCG(allRatings.subList(0, Math.min(ratingsInSearchHits.size(), allRatings.size())));
if (idcg != 0) {
@@ -41,35 +41,35 @@
public class EvalQueryQuality implements ToXContentFragment, Writeable {

private final String queryId;
private final double evaluationResult;
private final double metricScore;
private MetricDetail optionalMetricDetails;
private final List<RatedSearchHit> ratedHits;

public EvalQueryQuality(String id, double evaluationResult) {
public EvalQueryQuality(String id, double metricScore) {
this.queryId = id;
this.evaluationResult = evaluationResult;
this.metricScore = metricScore;
this.ratedHits = new ArrayList<>();
}

public EvalQueryQuality(StreamInput in) throws IOException {
this.queryId = in.readString();
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
this.ratedHits = in.readList(RatedSearchHit::new);
this.optionalMetricDetails = in.readOptionalNamedWriteable(MetricDetail.class);
}

// only used for parsing internally
private EvalQueryQuality(String queryId, ParsedEvalQueryQuality builder) {
this.queryId = queryId;
this.evaluationResult = builder.evaluationResult;
this.metricScore = builder.evaluationResult;
this.optionalMetricDetails = builder.optionalMetricDetails;
this.ratedHits = builder.ratedHits;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(queryId);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeList(ratedHits);
out.writeOptionalNamedWriteable(this.optionalMetricDetails);
}
@@ -78,8 +78,8 @@ public String getId() {
return queryId;
}

public double getQualityLevel() {
return evaluationResult;
public double metricScore() {
return metricScore;
}

public void setMetricDetails(MetricDetail breakdown) {
@@ -101,7 +101,7 @@ public List<RatedSearchHit> getHitsAndRatings() {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(queryId);
builder.field(QUALITY_LEVEL_FIELD.getPreferredName(), this.evaluationResult);
builder.field(METRIC_SCORE_FIELD.getPreferredName(), this.metricScore);
builder.startArray(UNRATED_DOCS_FIELD.getPreferredName());
for (DocumentKey key : EvaluationMetric.filterUnratedDocuments(ratedHits)) {
builder.startObject();
@@ -122,7 +122,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}

private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
static final ParseField METRIC_SCORE_FIELD = new ParseField("metric_score");
private static final ParseField UNRATED_DOCS_FIELD = new ParseField("unrated_docs");
private static final ParseField HITS_FIELD = new ParseField("hits");
private static final ParseField METRIC_DETAILS_FIELD = new ParseField("metric_details");
@@ -136,7 +136,7 @@ private static class ParsedEvalQueryQuality {
}

static {
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, QUALITY_LEVEL_FIELD);
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, METRIC_SCORE_FIELD);
PARSER.declareObject((obj, value) -> obj.optionalMetricDetails = value, (p, c) -> parseMetricDetail(p),
METRIC_DETAILS_FIELD);
PARSER.declareObjectArray((obj, list) -> obj.ratedHits = list, (p, c) -> RatedSearchHit.parse(p), HITS_FIELD);
@@ -164,13 +164,13 @@ public final boolean equals(Object obj) {
}
EvalQueryQuality other = (EvalQueryQuality) obj;
return Objects.equals(queryId, other.queryId) &&
Objects.equals(evaluationResult, other.evaluationResult) &&
Objects.equals(metricScore, other.metricScore) &&
Objects.equals(ratedHits, other.ratedHits) &&
Objects.equals(optionalMetricDetails, other.optionalMetricDetails);
}

@Override
public final int hashCode() {
return Objects.hash(queryId, evaluationResult, ratedHits, optionalMetricDetails);
return Objects.hash(queryId, metricScore, ratedHits, optionalMetricDetails);
}
}
@@ -39,23 +39,22 @@
public interface EvaluationMetric extends ToXContentObject, NamedWriteable {

/**
* Returns a single metric representing the ranking quality of a set of returned
* documents wrt. to a set of document ids labeled as relevant for this search.
* Evaluates a single ranking evaluation case.
*
* @param taskId
* the id of the query for which the ranking is currently evaluated
* an identifier of the query for which the search ranking is
* evaluated
* @param hits
* the result hits as returned by a search request
* the search result hits
* @param ratedDocs
* the documents that were ranked by human annotators for this query
* case
* @return some metric representing the quality of the result hit list wrt. to
* relevant doc ids.
* the documents that contain the document rating for this query case
* @return an {@link EvalQueryQuality} instance that contains the metric score
* with respect to the provided search hits and ratings
*/
EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

/**
* join hits with rated documents using the joint _index/_id document key
* Joins hits with rated documents using the joint _index/_id document key.
*/
static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs) {
Map<DocumentKey, RatedDocument> ratedDocumentMap = ratedDocs.stream()
@@ -74,19 +73,19 @@ static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocu
}

/**
* filter @link {@link RatedSearchHit} that don't have a rating
* Filter {@link RatedSearchHit}s that do not have a rating.
*/
static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits) {
return ratedHits.stream().filter(hit -> hit.getRating().isPresent() == false)
.map(hit -> new DocumentKey(hit.getSearchHit().getIndex(), hit.getSearchHit().getId())).collect(Collectors.toList());
}

/**
* how evaluation metrics for particular search queries get combined for the overall evaluation score.
* Defaults to averaging over the partial results.
* Combine several {@link EvalQueryQuality} results into the overall evaluation score.
* This defaults to averaging over the partial results, but can be overwritten to obtain a different behavior.
*/
default double combine(Collection<EvalQueryQuality> partialResults) {
return partialResults.stream().mapToDouble(EvalQueryQuality::getQualityLevel).sum() / partialResults.size();
return partialResults.stream().mapToDouble(EvalQueryQuality::metricScore).sum() / partialResults.size();
}

/**
@@ -110,8 +110,7 @@ public int getRelevantRatingThreshold() {
* Compute ReciprocalRank based on provided relevant document IDs.
**/
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs) {
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
int firstRelevant = -1;
int rank = 1;
@@ -48,15 +48,15 @@
public class RankEvalResponse extends ActionResponse implements ToXContentObject {

/** The overall evaluation result. */
private double evaluationResult;
private double metricScore;
/** details about individual ranking evaluation queries, keyed by their id */
private Map<String, EvalQueryQuality> details;
/** exceptions for specific ranking evaluation queries, keyed by their id */
private Map<String, Exception> failures;

public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> partialResults,
public RankEvalResponse(double metricScore, Map<String, EvalQueryQuality> partialResults,
Map<String, Exception> failures) {
this.evaluationResult = qualityLevel;
this.metricScore = metricScore;
this.details = new HashMap<>(partialResults);
this.failures = new HashMap<>(failures);
}
@@ -65,8 +65,8 @@ public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> parti
// only used in RankEvalAction#newResponse()
}

public double getEvaluationResult() {
return evaluationResult;
public double getMetricScore() {
return metricScore;
}

public Map<String, EvalQueryQuality> getPartialResults() {
@@ -85,7 +85,7 @@ public String toString() {
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeVInt(details.size());
for (String queryId : details.keySet()) {
out.writeString(queryId);
@@ -101,7 +101,7 @@ public void writeTo(StreamOutput out) throws IOException {
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
int partialResultSize = in.readVInt();
this.details = new HashMap<>(partialResultSize);
for (int i = 0; i < partialResultSize; i++) {
@@ -120,7 +120,7 @@ public void readFrom(StreamInput in) throws IOException {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("quality_level", evaluationResult);
builder.field("metric_score", metricScore);
builder.startObject("details");
for (String key : details.keySet()) {
details.get(key).toXContent(builder, params);
@@ -137,7 +137,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}

private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
private static final ParseField DETAILS_FIELD = new ParseField("details");
private static final ParseField FAILURES_FIELD = new ParseField("failures");
@SuppressWarnings("unchecked")
@@ -147,7 +146,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
((List<EvalQueryQuality>) a[1]).stream().collect(Collectors.toMap(EvalQueryQuality::getId, Function.identity())),
((List<Tuple<String, Exception>>) a[2]).stream().collect(Collectors.toMap(Tuple::v1, Tuple::v2))));
static {
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), QUALITY_LEVEL_FIELD);
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), EvalQueryQuality.METRIC_SCORE_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> EvalQueryQuality.fromXContent(p, n),
DETAILS_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> {
@@ -76,7 +76,7 @@ public void testDCGAt() {
hits[i].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);

/**
* Check with normalization: to get the maximal possible dcg, sort documents by
@@ -94,7 +94,7 @@ public void testDCGAt() {
* idcg = 14.595390756454922 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}

/**
@@ -127,7 +127,7 @@ public void testDCGAtSixMissingRatings() {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, rated);
assertEquals(12.779642067948913, result.getQualityLevel(), DELTA);
assertEquals(12.779642067948913, result.metricScore(), DELTA);
assertEquals(2, filterUnratedDocuments(result.getHitsAndRatings()).size());

/**
@@ -146,7 +146,7 @@ public void testDCGAtSixMissingRatings() {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}

/**
@@ -184,7 +184,7 @@ public void testDCGAtFourMoreRatings() {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(12.392789260714371, result.getQualityLevel(), DELTA);
assertEquals(12.392789260714371, result.metricScore(), DELTA);
assertEquals(1, filterUnratedDocuments(result.getHitsAndRatings()).size());

/**
@@ -204,7 +204,7 @@ public void testDCGAtFourMoreRatings() {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).getQualityLevel(), DELTA);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).metricScore(), DELTA);
}

/**
@@ -223,13 +223,13 @@ public void testNoResults() throws Exception {
SearchHit[] hits = new SearchHit[0];
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());

// also check normalized
dcg = new DiscountedCumulativeGain(true, null, 10);
result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
}
