-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathllm.py
361 lines (326 loc) · 15.1 KB
/
llm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import uuid
import json
from preprocess import extract_text_from_pdf, split_into_sentences, create_overlapping_chunks
import PyPDF2
# Closed vocabularies that the extraction prompt offers to the LLM.

# Building-system categories.
building_system = [
    "Wall", "Balcony", "Pod", "Frame", "Facade",
    "Floors", "Modules", "Plants", "Roofs", "Stairs",
]

# Product categories.
product_category = [
    "Boarding", "Solid Wall Panels", "Closed Wall Panels", "Twinwall",
    "Open Wall Panels", "Structural Insulated Panels (SIPs)",
    "Insulated Concrete Panels", "Prefabricated Balcony", "Pod",
    "Whole Building System", "Structural Frame", "Facade System",
    "Hollowcore Floor", "Concrete Lattice Floor", "Floor Cassettes",
    "Solid Floor Panels", "Volumetric module", "Prefabricated Plant",
    "Roof Panel", "Roof Truss", "Prefabricated Stairs",
]

# Material families; "Other" is the catch-all entry.
material = ["Timber", "Steel", "Concrete", "Hybrid", "Other"]
# Load variables from a .env file into the process environment at import time.
# NOTE(review): presumably this supplies the OpenAI credentials that ChatOpenAI
# reads - confirm against the deployment setup.
load_dotenv()
def define_model():
    """Build the chat model used for extraction.

    Returns:
        A ``ChatOpenAI`` instance created with ``temperature=0``.
    """
    return ChatOpenAI(temperature=0)
# Pydantic model whose field names mirror a subset of the keys produced by
# map_llm_response_to_updated_schema.
# NOTE(review): no code visible in this file instantiates or references this
# model - confirm whether it is still needed.
class SchemaOutput(BaseModel):
    id: str  # element identifier
    name: str  # element name
    description: str  # free-text description
    boundingBox: dict  # overall element dimensions
    buildingSystem: str  # building-system category
    productCategory: str  # product category
    material: dict  # material information
    dimensional: dict  # dimension ranges
def _dimension_range(llm_response: dict, dimension: str) -> dict:
    """Return ``{"min": ..., "max": ...}`` for one dimension of the response.

    Both the short (``min_width``) and the long (``minimum_width``) key
    spellings are accepted, because ChooseType declares the height limits as
    ``minimum_height``/``maximum_height`` but the width and length limits as
    ``min_*``/``max_*``. The short spelling wins when both are present.
    A limit that is missing under both spellings defaults to 0.
    """
    def first_present(*keys):
        for key in keys:
            if key in llm_response:
                return llm_response[key]
        return 0

    return {
        "min": first_present(f"min_{dimension}", f"minimum_{dimension}"),
        "max": first_present(f"max_{dimension}", f"maximum_{dimension}"),
    }


def map_llm_response_to_updated_schema(llm_response: dict) -> dict:
    """Map the flat LLM response onto the nested schema format.

    Args:
        llm_response: Parsed LLM output keyed by the ChooseType field names.
            Missing keys fall back to per-field defaults.

    Returns:
        The schema element as a dict with a freshly generated ``id``, or
        ``None`` (after printing the error) when ``llm_response`` is not a
        mapping, e.g. ``None`` or a list.
    """
    try:
        # The response carries a single material value; it is used for both
        # the finish and the structural material.
        element_material = llm_response.get("material", "Other")

        return {
            "id": generate_uuid(),
            "name": llm_response.get("element", "Unknown Element"),
            "description": llm_response.get("explanation", "No description provided."),
            "boundingBox": {
                "width": llm_response.get("width", 0),
                "height": llm_response.get("height", 0),
                # The schema calls this "depth"; the response calls it "length".
                "depth": llm_response.get("length", 0),
            },
            "images": [],  # Placeholder for image links
            "buildingSystem": llm_response.get("building_system", "Unknown"),
            "productCategory": llm_response.get("product_category", "Unknown"),
            "material": {
                "finishMaterial": element_material,
                "structuralMaterial": element_material,
            },
            "dimensional": {
                "width": _dimension_range(llm_response, "width"),
                "height": _dimension_range(llm_response, "height"),
                "length": _dimension_range(llm_response, "length"),
            },
            "structuralProperties": {
                "loadBearingCapacity": {
                    "maximumLoad": llm_response.get("maximum_load", 0),
                    "unit": llm_response.get("load_unit", "None"),
                    "loadDistribution": llm_response.get("load_distribution", "None"),
                },
            },
            "performance": {
                "resistanceToFireClassification": llm_response.get(
                    "resistance_to_fire_classification"
                ),
                "thermalTransmittance": llm_response.get("thermal_transmittance", 0),
                "acousticProperties": {
                    "soundInsulationRating": llm_response.get("sound_insulation_rating"),
                    "acousticPerformance": llm_response.get("acoustic_performance"),
                },
            },
            "sustainability": {
                "countryOfManufacturing": llm_response.get("country_of_manufacturing"),
                "classification": llm_response.get("sustainability_classification"),
            },
            "documentationAndCompliance": {
                "technicalSpecifications": llm_response.get("technical_specifications"),
                "certifications": {
                    "certificationTypes": llm_response.get("certification_types", []),
                    "regulatoryApprovals": llm_response.get("regulatory_approvals", []),
                },
            },
            "lifecycleAndMaintenance": {
                "expectedLifespan": llm_response.get("expected_lifespan"),
                "maintenanceRequirements": llm_response.get("maintenance_requirements"),
                "warranty": llm_response.get("warranty"),
            },
        }
    except (AttributeError, TypeError) as e:
        # Raised when the parser returned something that is not a mapping.
        print(f"Error mapping LLM response to schema: {e}")
        return None


def generate_uuid():
    """Return a new random UUID (version 4) as a string."""
    return str(uuid.uuid4())
# Extraction schema handed to JsonOutputParser in llm_stack; the field names and
# descriptions are rendered into the prompt's format instructions.
# A "#" comment is used instead of a class docstring on purpose: pydantic copies
# a model docstring into the generated JSON schema, which would alter the prompt.
# The height limits use the maximum_/minimum_ prefix while width and length use
# max_/min_; the names are kept as they are because the response is consumed by
# key name downstream.
class ChooseType(BaseModel):
    # Identification and classification
    element: str = Field(description="What is the name of identified entity.")
    building_system: str = Field(description="What kind of building system category the element belongs to.")
    product_category: str = Field(description="What kind of product category the element belongs to.")
    explanation: str = Field(description="The reason why you chose this specific product category.")
    material: str = Field(description="What kind of material the element is built from?")

    # Overall dimensions and their permitted ranges
    height: int = Field(description="What is the height of the element, as an integer value")
    width: int = Field(description="What is the width of the element?")
    length: int = Field(description="What is the length of the element?")
    maximum_height: int = Field(description="What is the maximum height of the element, as an integer value")
    minimum_height: int = Field(description="What is the minimum height of the element, as an integer value")
    max_width: int = Field(description="What is the maximum width of the element, as an integer value")
    min_width: int = Field(description="What is the minimum width of the element, as an integer value")
    max_length: int = Field(description="What is the maximum length of the element, as an integer value")
    min_length: int = Field(description="What is the minimum length of the element, as an integer value")

    # Structural properties
    maximum_load: int = Field(description="What is the maximum load-bearing capacity of the element, as an integer value")
    load_unit: str = Field(description="What is the unit of the load-bearing capacity, as unit in string")
    load_distribution: str = Field(description="Describe the load distribution of the element.")

    # Performance
    resistance_to_fire_classification: str = Field(description="What is the resistance to fire classification of the element?")
    # NOTE(review): declared as int - confirm fractional values are not expected.
    thermal_transmittance: int = Field(description="What is the thermal transmittance of the element as an integer value?")
    sound_insulation_rating: str = Field(description="What is the sound insulation rating of the element?")
    acoustic_performance: str = Field(description="Describe the acoustic performance of the element.")

    # Sustainability
    country_of_manufacturing: str = Field(description="Which country is the element manufactured in?")
    sustainability_classification: str = Field(description="What is the sustainability classification of the element?")

    # Documentation and compliance
    technical_specifications: str = Field(description="Provide the technical specifications of the element.")
    # Typed as lists: the descriptions ask for lists, the prompt example shows
    # [], and the schema mapper defaults both values to [].
    certification_types: list = Field(description="List the certification types of the element.")
    regulatory_approvals: list = Field(description="List the regulatory approvals of the element.")

    # Lifecycle and maintenance
    expected_lifespan: str = Field(description="What is the expected lifespan of the element?")
    maintenance_requirements: str = Field(description="What are the maintenance requirements of the element?")
    warranty: str = Field(description="What is the warranty period of the element?")
def llm_stack(model, chunk):
    """Process a single text chunk with the LLM and map the response.

    Builds the extraction prompt, pipes it through ``model`` and a
    ``JsonOutputParser`` configured with ChooseType, then converts the parsed
    response with map_llm_response_to_updated_schema.

    Args:
        model: The chat model to pipe the prompt into (see define_model).
        chunk: The text passage to extract element metadata from.

    Returns:
        Whatever map_llm_response_to_updated_schema returns for the parsed
        response: the schema element dict, or ``None`` if mapping failed.
    """
    # Local copies of the allowed category values; they shadow the
    # module-level lists of the same names.
    building_system = [
        "Wall", "Balcony", "Pod", "Frame", "Facade", "Floors", "Modules", "Plants", "Roofs", "Stairs"
    ]
    product_category = [
        "Boarding", "Solid Wall Panels", "Closed Wall Panels", "Twinwall",
        "Open Wall Panels", "Structural Insulated Panels (SIPs)", "Insulated Concrete Panels",
        "Prefabricated Balcony", "Pod", "Whole Building System", "Structural Frame", "Facade System",
        "Hollowcore Floor", "Concrete Lattice Floor", "Floor Cassettes", "Solid Floor Panels",
        "Volumetric module", "Prefabricated Plant", "Roof Panel", "Roof Truss", "Prefabricated Stairs"
    ]
    material = ["Timber", "Steel", "Concrete", "Hybrid", "Other"]

    parser = JsonOutputParser(pydantic_object=ChooseType)

    # The example output must use exactly the ChooseType field names: those are
    # the keys the format instructions describe and the schema mapper reads.
    # Its values are derived from the example input only, so the example does
    # not demonstrate inventing data. Literal JSON braces are doubled because
    # PromptTemplate treats single braces as placeholders.
    template = """
    You are an assistant that reads the given text and identifies elements related to the building domain. Here is a sample workflow that you can follow:
    1. Identify the elements described in the text.
    2. Extract metadata about the element, including dimensions, material, and other properties. Use synonyms for interpretation (e.g., "width" could be "thickness" or "breadth").
    3. If some features are indirectly mentioned or implied, infer the values if possible. For example, if "thick concrete walls" is stated, infer a material of "Concrete" and a general width range.
    4. For features that are not mentioned, type "None" for missing data.
    All dimensions must be in millimeters; output numerical values as integers without the unit.
    Features to extract:
    - Element Name
    - Building system: One of the following categories: {building_system}
    - Product category: One of the following categories: {product_category}
    - Material: One of the following: {material}
    - Dimensions:
        - Maximum height, minimum height
        - Maximum width, minimum width
        - Maximum length, minimum length
    - Structural properties (if mentioned):
        - Maximum load-bearing capacity (maximum load, unit, load distribution)
    - Performance features:
        - Resistance to fire classification
        - Thermal transmittance
        - Acoustic properties (sound insulation rating, acoustic performance)
    - Sustainability features:
        - Country of manufacturing
        - Sustainability classification
    - Documentation:
        - Technical specifications
        - Certification types
        - Regulatory approvals
    - Lifecycle and maintenance:
        - Expected lifespan
        - Maintenance requirements
        - Warranty
    Please use the exact format below for your response. Do not skip any field, and use "None" for missing data.
    Example input:
    "The wall system consists of solid wall panels made of concrete, with a thickness between 18 cm and 40 cm, and a height up to 9 meters."
    Example output:
    {{
        "element": "Wall System",
        "building_system": "Wall",
        "product_category": "Solid Wall Panels",
        "explanation": "The text mentions solid wall panels made of concrete with specific dimensions.",
        "material": "Concrete",
        "height": 9000,
        "width": "None",
        "length": "None",
        "maximum_height": 9000,
        "minimum_height": "None",
        "max_width": 400,
        "min_width": 180,
        "max_length": "None",
        "min_length": "None",
        "maximum_load": "None",
        "load_unit": "None",
        "load_distribution": "None",
        "resistance_to_fire_classification": "None",
        "thermal_transmittance": "None",
        "sound_insulation_rating": "None",
        "acoustic_performance": "None",
        "country_of_manufacturing": "None",
        "sustainability_classification": "None",
        "technical_specifications": "None",
        "certification_types": [],
        "regulatory_approvals": [],
        "expected_lifespan": "None",
        "maintenance_requirements": "None",
        "warranty": "None"
    }}
    - Given text: {chunk}
    - Output format: {format_instructions}
    """

    prompt = PromptTemplate(
        template=template,
        input_variables=["building_system", "product_category", "material", "chunk"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    chain = prompt | model | parser
    result = chain.invoke({
        "building_system": building_system,
        "product_category": product_category,
        "material": material,
        "chunk": chunk,
    })
    return map_llm_response_to_updated_schema(result)
if __name__ == "__main__":
    # Script entry point: extract schema elements from one PDF and store them
    # as JSON under result/<file_name>.json.
    file_name = "bele_doppelwand_2015"

    # PDF -> raw text -> sentences -> overlapping chunks.
    pdf_text = extract_text_from_pdf(f"Data/PDFs/{file_name}.pdf")
    chunks = create_overlapping_chunks(split_into_sentences(pdf_text))

    model = define_model()
    schema_outputs = []
    total = len(chunks)

    for position, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {position}/{total}...")
        mapped = llm_stack(model, chunk)
        print("LLM Response: ", mapped)
        if not mapped:
            # Nothing usable came back for this chunk; skip it.
            continue
        print("The output after mapping ... ")
        schema_outputs.append(mapped)

    print("Final Schema Outputs:")
    print(schema_outputs)

    # Persisting is best-effort: a failure is reported but does not abort.
    try:
        import os

        os.makedirs("result", exist_ok=True)
        with open(f"result/{file_name}.json", "w") as out_file:
            json.dump(schema_outputs, out_file, indent=4)
    except Exception as err:
        print("The error:", err)
##