| 1 | from humanloop import Humanloop |
| 2 | import inspect |
| 3 | import json |
| 4 | from json import JSONDecodeError |
| 5 | |
| 6 | |
# Credentials: replace both placeholders with your own keys before running.
HUMANLOOP_API_KEY = "<YOUR HUMANLOOP API KEY>"
OPENAI_API_KEY = "<YOUR OPENAI API KEY>"

# Client used for every Humanloop call in this script
humanloop = Humanloop(api_key=HUMANLOOP_API_KEY, openai_api_key=OPENAI_API_KEY)

# Project that will hold the model config and the evaluation
project = humanloop.projects.create(name="evals-guide")
project_id = project.id

# System message instructing the model to answer with a JSON object that has
# "feature" and "issue" keys.
_system_message = {
    "role": "system",
    "content": (
        "Extract the name of the feature or issue the customer is describing. "
        "Possible features are only: evaluations, experiments, fine-tuning \n"
        "Write your response in json format as follows:"
        ' \n {"feature": "feature requested", "issue": "description of issue"}'
    ),
}

# Register the first model config for the project, which will automatically be deployed
model_config = humanloop.model_configs.register(
    project_id=project_id,
    model="gpt-4",
    name="Entity extractor v0",
    chat_template=[_system_message],
    endpoint="chat",
    temperature=0.5,
)
config_id = model_config.config.id


def _test_case(user_message, target):
    """Build one datapoint: a single user message, its target and empty inputs."""
    return {
        "messages": [{"role": "user", "content": user_message}],
        "target": target,
        "inputs": {},
    }


# Example test case data
data = [
    _test_case(
        "Hi Humanloop support team, I'm having trouble understanding how to use the evaluations feature in your software. Can you provide a step-by-step guide or any resources to help me get started?",
        {"feature": "evaluations", "issue": "needs step-by-step guide"},
    ),
    _test_case(
        "Hi there, I'm interested in fine-tuning a language model using your software. Can you explain the process and provide any best practices or guidelines?",
        {
            "feature": "fine-tuning",
            "issue": "process explanation and best practices",
        },
    ),
]

# Create the dataset that the test cases will be stored in
dataset = humanloop.datasets.create(
    project_id=project_id,
    name="Target feature requests",
    description="Target feature request json extractions",
)

# Upload the test cases as datapoints of that dataset
datapoints = humanloop.datasets.create_datapoint(dataset_id=dataset.id, body=data)

# Generate a log by sending the first test case's messages to the deployed config
_chat_response = humanloop.chat_deployed(
    project_id=project_id,
    messages=data[0]["messages"],
)
log = _chat_response.data[0]
| 84 | |
| 85 | |
| 86 | # Define an evaluator |
| 87 | |
def check_feature_json(log, testcase):
    """Check that the feature named in the model's JSON output matches the target.

    Only ``json`` and ``JSONDecodeError`` are used here, because the source of
    this function is uploaded as a string with just those imports prepended.

    Args:
        log: Mapping with an "output" entry holding the raw text generated by
            the model.
        testcase: Mapping whose ``["target"]["feature"]`` entry is the
            expected feature name.

    Returns:
        True when the output parses to a JSON object whose "feature" value
        equals the expected feature; False otherwise, including when the
        output is None, is not valid JSON, or is valid JSON but not an object.
    """
    expected_feature = testcase["target"]["feature"]

    try:
        # The model is expected to produce valid JSON output but it could fail to do so.
        output = json.loads(log["output"])
    except (JSONDecodeError, TypeError):
        # Invalid JSON, or no text output at all (json.loads rejects None with
        # a TypeError): either way the model failed the check.
        return False

    # Valid JSON is not necessarily an object: a bare list, string or number
    # has no "feature" field and must fail rather than raise AttributeError.
    if not isinstance(output, dict):
        return False

    return expected_feature == output.get("feature")
| 100 | |
| 101 | |
# Try out the evaluator in this script before uploading it
print(f"Test case result: {check_feature_json(log, data[0])}")

# The evaluator must be sent as a string, so we convert it first.
# inspect.getsource() returns only the function's own source, so the imports
# it relies on are prepended by hand.
json_imports = "import json\nfrom json import JSONDecodeError\n"
evaluator_code = json_imports + inspect.getsource(check_feature_json)

# Send evaluator to Humanloop
evaluator = humanloop.evaluators.create(
    name="Feature request json",
    description="Validate that the json returned by the model matches the target json",
    code=evaluator_code,
    arguments_type="target_required",
    return_type="boolean",
)

# Finally trigger an evaluation.
# The dataset is referenced through dataset.id: this script never defines a
# standalone dataset_id variable, so using one here would raise NameError.
evaluation = humanloop.evaluations.create(
    project_id=project_id,
    evaluator_ids=[evaluator.id],
    config_id=config_id,
    dataset_id=dataset.id,
)

# Now navigate to your project's evaluations tab on Humanloop to inspect the results