Rename ranking evaluation quality_level to metric_score (#32168)
The notion of "quality" is an overloaded term in the search ranking evaluation
context. It is usually used to describe how "good" or "bad" a search result is
with respect to the user's information need. We currently report the result of the
ranking evaluation as `quality_level`, which is a bit misleading.
This change renames the response parameter to `metric_score`, which fits better.
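For callers of the Java high-level REST client, the rename only affects the response accessors. A minimal sketch, assuming a RankEvalResponse named `response` has already been obtained from a rank_eval call (the surrounding request setup is omitted and hypothetical):

    // After this commit: read the overall score and the per-query scores.
    double overallScore = response.getMetricScore();        // previously getEvaluationResult()
    for (EvalQueryQuality queryQuality : response.getPartialResults().values()) {
        // previously getQualityLevel()
        System.out.println(queryQuality.getId() + ": " + queryQuality.metricScore());
    }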
Christoph Büscher committed Jul 23, 2018
1 parent 82c9bc0 commit 2632d1d
Showing 20 changed files with 114 additions and 117 deletions.
@@ -81,7 +81,7 @@ public void testRankEvalRequest() throws IOException {
highLevelClient()::rankEval, highLevelClient()::rankEvalAsync);
// the expected Prec@ for the first query is 5/7 and the expected Prec@ for the second is 1/7, divided by 2 to get the average
double expectedPrecision = (1.0 / 7.0 + 5.0 / 7.0) / 2.0;
assertEquals(expectedPrecision, response.getEvaluationResult(), Double.MIN_VALUE);
assertEquals(expectedPrecision, response.getMetricScore(), Double.MIN_VALUE);
Map<String, EvalQueryQuality> partialResults = response.getPartialResults();
assertEquals(2, partialResults.size());
EvalQueryQuality amsterdamQueryQuality = partialResults.get("amsterdam_query");
@@ -1136,14 +1136,14 @@ public void testRankEval() throws Exception {
// end::rank-eval-execute

// tag::rank-eval-response
double evaluationResult = response.getEvaluationResult(); // <1>
double evaluationResult = response.getMetricScore(); // <1>
assertEquals(1.0 / 3.0, evaluationResult, 0.0);
Map<String, EvalQueryQuality> partialResults =
response.getPartialResults();
EvalQueryQuality evalQuality =
partialResults.get("kimchy_query"); // <2>
assertEquals("kimchy_query", evalQuality.getId());
double qualityLevel = evalQuality.getQualityLevel(); // <3>
double qualityLevel = evalQuality.metricScore(); // <3>
assertEquals(1.0 / 3.0, qualityLevel, 0.0);
List<RatedSearchHit> hitsAndRatings = evalQuality.getHitsAndRatings();
RatedSearchHit ratedSearchHit = hitsAndRatings.get(0);
6 changes: 3 additions & 3 deletions docs/reference/search/rank-eval.asciidoc
@@ -274,10 +274,10 @@ that shows potential errors of individual queries. The response has the followin
--------------------------------
{
"rank_eval": {
"quality_level": 0.4, <1>
"metric_score": 0.4, <1>
"details": {
"my_query_id1": { <2>
"quality_level": 0.6, <3>
"metric_score": 0.6, <3>
"unrated_docs": [ <4>
{
"_index": "my_index",
@@ -312,7 +312,7 @@ that shows potential errors of individual queries. The response has the followin

<1> the overall evaluation quality calculated by the defined metric
<2> the `details` section contains one entry for every query in the original `requests` section, keyed by the search request id
<3> the `quality_level` in the `details` section shows the contribution of this query to the global quality score
<3> the `metric_score` in the `details` section shows the contribution of this query to the global quality metric score
<4> the `unrated_docs` section contains an `_index` and `_id` entry for each document in the search result for this
query that didn't have a ratings value. This can be used to ask the user to supply ratings for these documents
<5> the `hits` section shows a grouping of the search results with their supplied rating
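As a hedged illustration of how these response parts map onto the Java API touched by this commit, the sketch below walks the `details` entries and collects the unrated documents for each query; `response` is assumed to be an already obtained RankEvalResponse, and everything else uses accessors shown in this diff:

    for (EvalQueryQuality queryQuality : response.getPartialResults().values()) {      // <2>
        double perQueryScore = queryQuality.metricScore();                             // <3>
        // Documents that appeared in the hits but carry no rating; these could be
        // handed back to the user to supply ratings.                                  // <4>
        List<DocumentKey> unratedDocs =
                EvaluationMetric.filterUnratedDocuments(queryQuality.getHitsAndRatings()); // <5>
    }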
@@ -126,8 +126,6 @@ public Optional<Integer> forcedSearchSize() {
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
List<Integer> ratingsInSearchHits = new ArrayList<>(ratedHits.size());
int unratedResults = 0;
@@ -144,6 +142,8 @@ public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
double idcg = 0;

if (normalize) {
List<Integer> allRatings = ratedDocs.stream().mapToInt(RatedDocument::getRating).boxed()
.collect(Collectors.toList());
Collections.sort(allRatings, Comparator.nullsLast(Collections.reverseOrder()));
idcg = computeDCG(allRatings.subList(0, Math.min(ratingsInSearchHits.size(), allRatings.size())));
if (idcg != 0) {
@@ -41,35 +41,35 @@
public class EvalQueryQuality implements ToXContentFragment, Writeable {

private final String queryId;
private final double evaluationResult;
private final double metricScore;
private MetricDetail optionalMetricDetails;
private final List<RatedSearchHit> ratedHits;

public EvalQueryQuality(String id, double evaluationResult) {
public EvalQueryQuality(String id, double metricScore) {
this.queryId = id;
this.evaluationResult = evaluationResult;
this.metricScore = metricScore;
this.ratedHits = new ArrayList<>();
}

public EvalQueryQuality(StreamInput in) throws IOException {
this.queryId = in.readString();
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
this.ratedHits = in.readList(RatedSearchHit::new);
this.optionalMetricDetails = in.readOptionalNamedWriteable(MetricDetail.class);
}

// only used for parsing internally
private EvalQueryQuality(String queryId, ParsedEvalQueryQuality builder) {
this.queryId = queryId;
this.evaluationResult = builder.evaluationResult;
this.metricScore = builder.evaluationResult;
this.optionalMetricDetails = builder.optionalMetricDetails;
this.ratedHits = builder.ratedHits;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(queryId);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeList(ratedHits);
out.writeOptionalNamedWriteable(this.optionalMetricDetails);
}
@@ -78,8 +78,8 @@ public String getId() {
return queryId;
}

public double getQualityLevel() {
return evaluationResult;
public double metricScore() {
return metricScore;
}

public void setMetricDetails(MetricDetail breakdown) {
@@ -101,7 +101,7 @@ public List<RatedSearchHit> getHitsAndRatings() {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(queryId);
builder.field(QUALITY_LEVEL_FIELD.getPreferredName(), this.evaluationResult);
builder.field(METRIC_SCORE_FIELD.getPreferredName(), this.metricScore);
builder.startArray(UNRATED_DOCS_FIELD.getPreferredName());
for (DocumentKey key : EvaluationMetric.filterUnratedDocuments(ratedHits)) {
builder.startObject();
@@ -122,7 +122,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}

private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
static final ParseField METRIC_SCORE_FIELD = new ParseField("metric_score");
private static final ParseField UNRATED_DOCS_FIELD = new ParseField("unrated_docs");
private static final ParseField HITS_FIELD = new ParseField("hits");
private static final ParseField METRIC_DETAILS_FIELD = new ParseField("metric_details");
@@ -136,7 +136,7 @@ private static class ParsedEvalQueryQuality {
}

static {
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, QUALITY_LEVEL_FIELD);
PARSER.declareDouble((obj, value) -> obj.evaluationResult = value, METRIC_SCORE_FIELD);
PARSER.declareObject((obj, value) -> obj.optionalMetricDetails = value, (p, c) -> parseMetricDetail(p),
METRIC_DETAILS_FIELD);
PARSER.declareObjectArray((obj, list) -> obj.ratedHits = list, (p, c) -> RatedSearchHit.parse(p), HITS_FIELD);
@@ -164,13 +164,13 @@ public final boolean equals(Object obj) {
}
EvalQueryQuality other = (EvalQueryQuality) obj;
return Objects.equals(queryId, other.queryId) &&
Objects.equals(evaluationResult, other.evaluationResult) &&
Objects.equals(metricScore, other.metricScore) &&
Objects.equals(ratedHits, other.ratedHits) &&
Objects.equals(optionalMetricDetails, other.optionalMetricDetails);
}

@Override
public final int hashCode() {
return Objects.hash(queryId, evaluationResult, ratedHits, optionalMetricDetails);
return Objects.hash(queryId, metricScore, ratedHits, optionalMetricDetails);
}
}
@@ -39,23 +39,22 @@
public interface EvaluationMetric extends ToXContentObject, NamedWriteable {

/**
* Returns a single metric representing the ranking quality of a set of returned
* documents wrt. to a set of document ids labeled as relevant for this search.
* Evaluates a single ranking evaluation case.
*
* @param taskId
* the id of the query for which the ranking is currently evaluated
* an identifier of the query for which the search ranking is
* evaluated
* @param hits
* the result hits as returned by a search request
* the search result hits
* @param ratedDocs
* the documents that were ranked by human annotators for this query
* case
* @return some metric representing the quality of the result hit list wrt. to
* relevant doc ids.
* the documents that contain the document rating for this query case
* @return an {@link EvalQueryQuality} instance that contains the metric score
* with respect to the provided search hits and ratings
*/
EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs);

/**
* join hits with rated documents using the joint _index/_id document key
* Joins hits with rated documents using the joint _index/_id document key.
*/
static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocument> ratedDocs) {
Map<DocumentKey, RatedDocument> ratedDocumentMap = ratedDocs.stream()
@@ -74,19 +73,19 @@ static List<RatedSearchHit> joinHitsWithRatings(SearchHit[] hits, List<RatedDocu
}

/**
* filter @link {@link RatedSearchHit} that don't have a rating
* Filter {@link RatedSearchHit}s that do not have a rating.
*/
static List<DocumentKey> filterUnratedDocuments(List<RatedSearchHit> ratedHits) {
return ratedHits.stream().filter(hit -> hit.getRating().isPresent() == false)
.map(hit -> new DocumentKey(hit.getSearchHit().getIndex(), hit.getSearchHit().getId())).collect(Collectors.toList());
}

/**
* how evaluation metrics for particular search queries get combined for the overall evaluation score.
* Defaults to averaging over the partial results.
* Combine several {@link EvalQueryQuality} results into the overall evaluation score.
* This defaults to averaging over the partial results, but can be overwritten to obtain a different behavior.
*/
default double combine(Collection<EvalQueryQuality> partialResults) {
return partialResults.stream().mapToDouble(EvalQueryQuality::getQualityLevel).sum() / partialResults.size();
return partialResults.stream().mapToDouble(EvalQueryQuality::metricScore).sum() / partialResults.size();
}

/**
@@ -110,8 +110,7 @@ public int getRelevantRatingThreshold() {
* Compute ReciprocalRank based on provided relevant document IDs.
**/
@Override
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits,
List<RatedDocument> ratedDocs) {
public EvalQueryQuality evaluate(String taskId, SearchHit[] hits, List<RatedDocument> ratedDocs) {
List<RatedSearchHit> ratedHits = joinHitsWithRatings(hits, ratedDocs);
int firstRelevant = -1;
int rank = 1;
@@ -48,15 +48,15 @@
public class RankEvalResponse extends ActionResponse implements ToXContentObject {

/** The overall evaluation result. */
private double evaluationResult;
private double metricScore;
/** details about individual ranking evaluation queries, keyed by their id */
private Map<String, EvalQueryQuality> details;
/** exceptions for specific ranking evaluation queries, keyed by their id */
private Map<String, Exception> failures;

public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> partialResults,
public RankEvalResponse(double metricScore, Map<String, EvalQueryQuality> partialResults,
Map<String, Exception> failures) {
this.evaluationResult = qualityLevel;
this.metricScore = metricScore;
this.details = new HashMap<>(partialResults);
this.failures = new HashMap<>(failures);
}
@@ -65,8 +65,8 @@ public RankEvalResponse(double qualityLevel, Map<String, EvalQueryQuality> parti
// only used in RankEvalAction#newResponse()
}

public double getEvaluationResult() {
return evaluationResult;
public double getMetricScore() {
return metricScore;
}

public Map<String, EvalQueryQuality> getPartialResults() {
@@ -85,7 +85,7 @@ public String toString() {
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeDouble(evaluationResult);
out.writeDouble(metricScore);
out.writeVInt(details.size());
for (String queryId : details.keySet()) {
out.writeString(queryId);
@@ -101,7 +101,7 @@ public void writeTo(StreamOutput out) throws IOException {
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
this.evaluationResult = in.readDouble();
this.metricScore = in.readDouble();
int partialResultSize = in.readVInt();
this.details = new HashMap<>(partialResultSize);
for (int i = 0; i < partialResultSize; i++) {
@@ -120,7 +120,7 @@ public void readFrom(StreamInput in) throws IOException {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("quality_level", evaluationResult);
builder.field("metric_score", metricScore);
builder.startObject("details");
for (String key : details.keySet()) {
details.get(key).toXContent(builder, params);
@@ -137,7 +137,6 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}

private static final ParseField QUALITY_LEVEL_FIELD = new ParseField("quality_level");
private static final ParseField DETAILS_FIELD = new ParseField("details");
private static final ParseField FAILURES_FIELD = new ParseField("failures");
@SuppressWarnings("unchecked")
@@ -147,7 +146,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
((List<EvalQueryQuality>) a[1]).stream().collect(Collectors.toMap(EvalQueryQuality::getId, Function.identity())),
((List<Tuple<String, Exception>>) a[2]).stream().collect(Collectors.toMap(Tuple::v1, Tuple::v2))));
static {
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), QUALITY_LEVEL_FIELD);
PARSER.declareDouble(ConstructingObjectParser.constructorArg(), EvalQueryQuality.METRIC_SCORE_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> EvalQueryQuality.fromXContent(p, n),
DETAILS_FIELD);
PARSER.declareNamedObjects(ConstructingObjectParser.optionalConstructorArg(), (p, c, n) -> {
@@ -76,7 +76,7 @@ public void testDCGAt() {
hits[i].shard(new SearchShardTarget("testnode", new Index("index", "uuid"), 0, null));
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_DCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);

/**
* Check with normalization: to get the maximal possible dcg, sort documents by
@@ -94,7 +94,7 @@ public void testDCGAt() {
* idcg = 14.595390756454922 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(EXPECTED_NDCG, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}

/**
@@ -127,7 +127,7 @@ public void testDCGAtSixMissingRatings() {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, rated);
assertEquals(12.779642067948913, result.getQualityLevel(), DELTA);
assertEquals(12.779642067948913, result.metricScore(), DELTA);
assertEquals(2, filterUnratedDocuments(result.getHitsAndRatings()).size());

/**
@@ -146,7 +146,7 @@ public void testDCGAtSixMissingRatings() {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).getQualityLevel(), DELTA);
assertEquals(12.779642067948913 / 13.347184833073591, dcg.evaluate("id", hits, rated).metricScore(), DELTA);
}

/**
@@ -184,7 +184,7 @@ public void testDCGAtFourMoreRatings() {
}
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(12.392789260714371, result.getQualityLevel(), DELTA);
assertEquals(12.392789260714371, result.metricScore(), DELTA);
assertEquals(1, filterUnratedDocuments(result.getHitsAndRatings()).size());

/**
@@ -204,7 +204,7 @@ public void testDCGAtFourMoreRatings() {
* idcg = 13.347184833073591 (sum of last column)
*/
dcg = new DiscountedCumulativeGain(true, null, 10);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).getQualityLevel(), DELTA);
assertEquals(12.392789260714371 / 13.347184833073591, dcg.evaluate("id", hits, ratedDocs).metricScore(), DELTA);
}

/**
@@ -223,13 +223,13 @@ public void testNoResults() throws Exception {
SearchHit[] hits = new SearchHit[0];
DiscountedCumulativeGain dcg = new DiscountedCumulativeGain();
EvalQueryQuality result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());

// also check normalized
dcg = new DiscountedCumulativeGain(true, null, 10);
result = dcg.evaluate("id", hits, ratedDocs);
assertEquals(0.0d, result.getQualityLevel(), DELTA);
assertEquals(0.0d, result.metricScore(), DELTA);
assertEquals(0, filterUnratedDocuments(result.getHitsAndRatings()).size());
}
