From 363f159e5ab893cd8a9400b1784cfeea3654351e Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Fri, 26 Jan 2024 11:32:57 -0500 Subject: [PATCH 1/3] Add hacker news comments to the public demo. --- lilac_hf_space.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 90b88212..5366d44f 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -173,6 +173,23 @@ datasets: path: - completion + - namespace: lilac + name: hncomments-1m + source: + filepaths: + - /Users/brian/dev/lilac/data/datasets/local/hncomments-duckprogress/data-00000-of-00001.parquet + sample_size: 1000000 + source_name: parquet + settings: + tags: [datasets] + ui: + media_paths: + - text + embeddings: + - embedding: gte-small + path: + - text + ## Eval datasets - namespace: lilac name: MMLU From caeaa4637fe6cd1e58d44a4d2a382a76db1d86e1 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Fri, 26 Jan 2024 11:33:30 -0500 Subject: [PATCH 2/3] save --- lilac_hf_space.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 5366d44f..31860533 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -497,6 +497,11 @@ clusters: output_path: - messages__clusters + - dataset_namespace: lilac + dataset_name: hncomments-1m + input_path: + - text + # NOTE: We cluster both prompt and completion because the roblox prompts are just small sections # of code which are not extremely descriptive. - dataset_namespace: lilac From e69d5015a7d9273e335b8ee9f4df9c2fc2c21af4 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Fri, 26 Jan 2024 12:01:13 -0500 Subject: [PATCH 3/3] save --- lilac_hf_space.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml index 31860533..c09d594c 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -176,10 +176,9 @@ datasets: - namespace: lilac name: hncomments-1m source: - filepaths: - - /Users/brian/dev/lilac/data/datasets/local/hncomments-duckprogress/data-00000-of-00001.parquet + dataset_name: OpenPipe/hacker-news sample_size: 1000000 - source_name: parquet + source_name: huggingface settings: tags: [datasets] ui: