Break Down Reasoning Into Multiple Steps
Cumulative Reasoning [1] aims to generate better outputs by dividing the reasoning process into three separate steps:
- Propose: An LLM first suggests potential steps based on the current context, initiating the reasoning cycle
- Verify: We then assess the proposer's suggestions for accuracy, incorporating valid steps into the ongoing context
- Report: We then determine the appropriate moment to conclude the reasoning process
By first generating potential steps and separating out each portion of the reasoning process, we can obtain significant improvements on logical inference tasks and mathematical problems.
We can implement this using instructor as seen below.
import instructor
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from textwrap import dedent
from typing import Literal
import asyncio

client = instructor.from_openai(AsyncOpenAI())


class Proposition(BaseModel):
    premise1: str
    premise2: str
    reasoning: str
    proposition: str


class ProposerOutput(BaseModel):
    reasoning: str
    valid_propositions: list[Proposition] = Field(
        description="Concise list of Propositions derived from the premises that are relevant to the hypothesis. Note that each Proposition is derived from at most two given premises",
        min_length=4,
    )
    prediction: Literal["False", "True", "Unknown"]


class VerifiedProposition(BaseModel):
    proposition: str
    reasoning: str
    is_valid: bool


class ReporterOutput(BaseModel):
    reasoning: str
    is_valid_hypothesis: bool


async def generate_propositions(premises: list[str], hypothesis: str) -> ProposerOutput:
    # Propose: derive candidate propositions from pairs of premises
    formatted_premises = "- " + "\n- ".join(premises)
    return await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": dedent(
                    """
                    Suppose you are one of the greatest AI
                    scientists, logicians, and mathematicians.
                    Let us think step by step. Please use
                    First-Order Logic (FOL) to deduce a list
                    of Propositions. Each Proposition is
                    derived from two given Premises and
                    should be logically correct. Most
                    importantly, each Proposition should
                    not duplicate the two premises that it
                    is derived from. Please make sure your
                    reasoning is directly deduced from the
                    Premises and Propositions rather than
                    introducing unsourced common knowledge
                    and unsourced information by common
                    sense reasoning.
                    """
                ),
            },
            {
                "role": "user",
                "content": dedent(
                    f"""
                    Premises:
                    {formatted_premises}
                    We want to deduce more Propositions to
                    determine the correctness of the following
                    Hypothesis:
                    Hypothesis: {hypothesis}
                    """
                ),
            },
        ],
        response_model=ProposerOutput,
        model="gpt-4o",
    )


async def verify_propositions(
    premise_evaluation: ProposerOutput,
) -> list[VerifiedProposition]:
    # Verify: check each proposed deduction independently and concurrently
    async def create_verification_task(proposition: Proposition) -> VerifiedProposition:
        return await client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": """
                    Suppose you are one of the greatest AI
                    scientists, logicians, and mathematicians.
                    Let us think step by step. Please use
                    First-Order Logic (FOL) to determine
                    whether the deduction of two given
                    Premises to a Proposition is valid or not,
                    and reply with True or False.
                    """,
                },
                {
                    "role": "user",
                    "content": f"""
                    Premises:
                    {proposition.premise1}
                    {proposition.premise2}
                    Proposition:
                    {proposition.proposition}
                    """,
                },
            ],
            response_model=VerifiedProposition,
            model="gpt-4o",
        )

    tasks = [
        create_verification_task(proposition)
        for proposition in premise_evaluation.valid_propositions
    ]
    return await asyncio.gather(*tasks)
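
# Note: asyncio.gather above fires every verification request at once. If you
# run into provider rate limits, one possible adjustment (our suggestion, not
# part of the original recipe) is to wrap the client call inside
# create_verification_task in an asyncio.Semaphore to bound concurrency.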


async def final_evaluation(
    propositions: list[str], hypothesis: str, premises: list[str]
) -> ReporterOutput:
    # Report: judge the hypothesis against the premises and verified propositions
    formatted_premises = "- " + "\n- ".join(premises)
    formatted_propositions = "- " + "\n- ".join(propositions)
    return await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """
                Suppose you are one of the greatest AI
                scientists, logicians, and mathematicians.
                Let us think step by step. Read and analyze
                the "Premises" first, then use First-Order
                Logic (FOL) to judge whether the "Hypothesis"
                is True, False, or Unknown. Please make sure
                your reasoning is directly deduced from the
                "Premises" and "Propositions" rather than
                introducing unsourced common knowledge and
                unsourced information by common sense
                reasoning.
                """,
            },
            {
                "role": "user",
                "content": f"""
                Premises:
                {formatted_premises}
                Hypothesis: {hypothesis}
                """,
            },
            {
                "role": "assistant",
                "content": f"""
                Let's think step by step. From the premises,
                we can deduce the following propositions:
                {formatted_propositions}
                Recall the Hypothesis: {hypothesis}
                """,
            },
        ],
        response_model=ReporterOutput,
    )


if __name__ == "__main__":
    hypothesis = "Hyraxes lay eggs"
    premises = [
        "The only types of mammals that lay eggs are platypuses and echidnas",
        "Platypuses are not hyrax",
        "Echidnas are not hyrax",
        "No mammals are invertebrates",
        "All animals are either vertebrates or invertebrates",
        "Mammals are animals",
        "Hyraxes are mammals",
        "Grebes lay eggs",
        "Grebes are not platypuses and also not echidnas",
    ]

    # Step 1 (Propose): generate candidate propositions from the premises
    premise_evaluation = asyncio.run(generate_propositions(premises, hypothesis))

    # Step 2 (Verify): keep only the propositions whose deductions check out
    verification_result = asyncio.run(verify_propositions(premise_evaluation))
    filtered_propositions = [
        proposition.proposition
        for proposition in verification_result
        if proposition.is_valid
    ]

    # Step 3 (Report): render the final verdict on the hypothesis
    reporter_output = asyncio.run(
        final_evaluation(filtered_propositions, hypothesis, premises)
    )
    print(reporter_output.model_dump_json(indent=2))
"""
{
"reasoning": "Based on the premises provided, the
only mammals that lay eggs are platypuses and
echidnas. Hyraxes are mammals but are explicitly
stated as not being platypuses or echidnas. Hence,
there is no basis in the premises to conclude that
hyraxes lay eggs. \n\nTherefore, the hypothesis that
hyraxes lay eggs is False.",
"is_valid_hypothesis": false
}
"""