User Query
The goal is to highlight the person who is wearing glasses and talking on the phone. To achieve this, follow the procedure step by step:
- Load the image /root/codenav/assets/images/bbt.png
- Segment the image and discard all segments except people. Also sort them in increasing order of bounding box width
- For each person, get the bounding box of the face (by taking the top-third of the bbox height; implement this yourself)
- Get a face image for each person by cropping the face bbox
- For each cropped face, use visual question answering to check the following attributes: (a) is wearing glasses, (b) is talking on a cell phone
- Save face images to /root/results/playground/bbt_person_i.png where i denotes the index in the list
- Create an html page containing a table with one row per person and columns for the above attributes. The first column should be the face image (make sure to use a relative path, i.e. just bbt_person_i.png).
- Save the result as /root/results/playground/bbt.html
- Select the person segment where all attributes are present
- Create a color pop to highlight that person
- Save the result as /root/results/playground/bbt_color_pop.png
Interaction 1
First, I need to search for a function that can segment an image and return the segments along with their bounding boxes. This will be the starting point for identifying people in the image before proceeding with the subsequent steps.
text: segmenting objects
QUERY: text: segmenting objects CODE BLOCKS: --- file_path=mnm/tool_api.py lines=[413, 461] type=FUNCTION content={ def image_segmentation(image, ckpt=MODEL_SELECTION["image_segmentation"]): """ It takes an image, segments it into different parts, and outputs segmentation masks of any shape for the parts. """ import numpy as np import torch import transformers from PIL import Image img = image_processing(image) feature_extractor = transformers.MaskFormerFeatureExtractor.from_pretrained(ckpt) model = transformers.MaskFormerForInstanceSegmentation.from_pretrained(ckpt).to( DEVICE ) model.eval() inputs = feature_extractor(images=img, return_tensors="pt") inputs = {k: v.to(DEVICE) for k, v in inputs.items()} with torch.no_grad(): outputs = model(**inputs) outputs = feature_extractor.post_process_panoptic_segmentation(outputs)[0] instance_map = outputs["segmentation"].cpu().numpy() objs = [] for seg in outputs["segments_info"]: inst_id = seg["id"] label_id = seg["label_id"] category = model.config.id2label[label_id] mask = (instance_map == inst_id).astype(float) resized_mask = np.array( Image.fromarray(mask).resize(img.size, resample=Image.BILINEAR) ) Y, X = np.where(resized_mask > 0.5) x1, x2 = np.min(X), np.max(X) y1, y2 = np.min(Y), np.max(Y) num_pixels = np.sum(mask) objs.append( dict( mask=resized_mask, label=category, bbox=[x1, y1, x2, y2], inst_id=inst_id, ) ) results_formatted = {"image": img, "objects": objs} return results_formatted } --- file_path=mnm/tool_api.py lines=[491, 534] type=FUNCTION content={ def image_crop(image, object, ckpt=None): """ It takes an image and 4 numbers representing the coordinates of a bounding box and crops the image to the region within the box. """ def expand_box(box, img_size, factor=1.5): W, H = img_size x1, y1, x2, y2 = box dw = int(factor * (x2 - x1) / 2) dh = int(factor * (y2 - y1) / 2) cx = int((x1 + x2) / 2) cy = int((y1 + y2) / 2) x1 = max(0, cx - dw) x2 = min(cx + dw, W) y1 = max(0, cy - dh) y2 = min(cy + dh, H) return [x1, y1, x2, y2] img = image_processing(image) bbox = object["bbox"] if isinstance(bbox, str): try: bbox = ast.literal_eval(bbox) except: bbox = [] # if len(objs) > 0: # box = objs[0]["bbox"] # only remove the first one in the box # box = expand_box(box, img.size) # out_img = img.crop(box) if len(bbox) == 4: use_percent = all(x < 1.0 for x in bbox) if use_percent: W, H = img.size bbox = [bbox[0] * W, bbox[1] * H, bbox[2] * W, bbox[3] * H] out_img = img.crop(bbox) else: out_img = img result_formatted = {"image": out_img} return result_formatted } --- file_path=mnm/tool_api.py lines=[721, 729] type=FUNCTION content={ def count(objects, ckpt=None): """ It takes a list of objects and returns the count of the objects. """ objs = objects result_formatted = {"number": len(objs)} return result_formatted } --- (38 additional code blocks not shown. Search again with the same query to see additional results.) Prototypes for the next 7 out of 35 classes/functions found in unexpanded results (search again with the same query to see details): def select_object(objects, object_name, ckpt=None): ... # mnm/tool_api.py def tag(image, objects, ckpt=None): ... # mnm/tool_api.py def object_detection(image, ckpt=MODEL_SELECTION['object_detection']): ... # mnm/tool_api.py def color_pop(image, object, ckpt=None): ... # mnm/tool_api.py def emoji(image, object, emoji, ckpt=None): ... # mnm/tool_api.py def background_blur(image, object, ckpt=None): ... # mnm/tool_api.py def get_math_fact(number: str, ckpt=None): ... # mnm/tool_api.py
Interaction 2
The image_segmentation function from tool_api.py seems to be the right function to use for segmenting the image. It returns segmentation masks and bounding boxes for each segment, which is what we need for step 2 of the task. I will write code to load the image, segment it, and filter out segments that are not people. Then I will sort the people segments in increasing order of bounding box width, as required.
from mnm.tool_api import image_segmentation
from PIL import Image
# Load the image
image_path = '/root/codenav/assets/images/bbt.png'
image = Image.open(image_path)
# Segment the image
segmentation_results = image_segmentation(image)
# Filter out segments that are not people and sort them by bounding box width
people_segments = [seg for seg in segmentation_results['objects'] if seg['label'] == 'person']
people_segments_sorted = sorted(people_segments, key=lambda x: x['bbox'][2] - x['bbox'][0])
# Proceed to the next step with the sorted people segments
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"image_segmentation": <function image_segmentation at 0x7f1cbd89e9e0>, "Image": <module 'PIL.Image' from '/opt/conda/lib/python3.10/site-packages/PIL/Image.py'>, "image_path": /root/codenav/assets/images/bbt.png, "image": <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1034x798 at 0x7F1E1AA2D990>, "segmentation_results": {'image': <PIL.Image.Image image mode=RGB size=1034x798 at 0x7F1E1AA2EEC0>, 'objects': [{'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'label': 'couch', 'bbox': [24, 511, 935, 797], 'inst_id': 1}, {'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.],..., "people_segments": [{'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'label': 'person', 'bbox': [556, 84, 821, 797], 'inst_id': 2}, {'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0...., "people_segments_sorted": [{'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'label': 'person', 'bbox': [384, 100, 571, 486], 'inst_id': 7}, {'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0...}
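For reference, a minimal sanity check (not executed in this session) could print each sorted segment's bounding-box width to confirm the increasing order:
# Sanity-check sketch: print bounding-box widths of the sorted person segments
for idx, seg in enumerate(people_segments_sorted):
    x1, _, x2, _ = seg['bbox']
    print(f"person {idx}: bbox width = {x2 - x1}")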
Interaction 3
The code executed successfully, and we have the sorted people segments. Now I will proceed with step 3, which is to get the bounding box of the face for each person by taking the top third of the bbox height; I will implement this myself, as the task requires.
# Function to get the top-third of the bounding box height for the face
def get_face_bbox(person_bbox):
x1, y1, x2, y2 = person_bbox
face_height = (y2 - y1) // 3
face_bbox = [x1, y1, x2, y1 + face_height]
return face_bbox
# Get the face bounding box for each person
face_bboxes = [get_face_bbox(person['bbox']) for person in people_segments_sorted]
# Proceed to the next step with the face bounding boxes
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"get_face_bbox": <function get_face_bbox at 0x7f1cb0366320>, "face_bboxes": [[384, 100, 571, 228], [133, 311, 363, 473], [133, 52, 395, 184], [556, 84, 821, 321], [721, 144, 986, 282], [411, 283, 802, 454], [337, 291, 751, 459]]}
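To visually verify the top-third heuristic, one could draw the computed face boxes on a copy of the image; a minimal sketch assuming Pillow's ImageDraw and a hypothetical debug output path:
from PIL import ImageDraw
# Sketch: draw each computed face bbox on a copy of the original image
debug_image = image.convert('RGB')
draw = ImageDraw.Draw(debug_image)
for bbox in face_bboxes:
    draw.rectangle(bbox, outline='red', width=3)
debug_image.save('/root/results/playground/bbt_face_bboxes_debug.png')  # hypothetical path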
Interaction 4
The face bounding boxes have been successfully calculated. The next step is to crop the face image for each person using the image_crop function we found earlier. I will write code to crop the face images and save them to the specified directory.
from mnm.tool_api import image_crop
import os
# Ensure the results directory exists
os.makedirs('/root/results/playground', exist_ok=True)
# Function to crop the face image and save it
def crop_and_save_face_images(face_bboxes, people_segments_sorted, image):
face_images = []
for i, face_bbox in enumerate(face_bboxes):
# Crop the face image
face_image = image_crop(image, {"bbox": face_bbox})['image']
face_images.append(face_image)
# Save the face image
face_image_path = f'/root/results/playground/bbt_person_{i}.png'
face_image.save(face_image_path)
return face_images
# Crop and save the face images
face_images = crop_and_save_face_images(face_bboxes, people_segments_sorted, image)
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"image_crop": <function image_crop at 0x7f1cbd89eb00>, "os": <module 'os' from '/opt/conda/lib/python3.10/os.py'>, "crop_and_save_face_images": <function crop_and_save_face_images at 0x7f1c78c80550>, "face_images": [<PIL.Image.Image image mode=RGB size=187x128 at 0x7F1C77157F40>, <PIL.Image.Image image mode=RGB size=230x162 at 0x7F1C77157430>, <PIL.Image.Image image mode=RGB size=262x132 at 0x7F1C77157940>, <PIL.Image.Image image mode=RGB size=265x237 at 0x7F1C771576A0>, <PIL.Image.Image image mode=RGB size=265x138 at 0x7F1C771572E0>, <PIL.Image.Image image mode=RGB size=391x171 at 0x7F1C77157790>, <PIL.Image.Image image mode=RGB size=414x168 at 0x7F1C77157F70>]}
Interaction 5
The face images have been successfully cropped and saved. Now, I need to search for a function that can perform visual question answering on each cropped face to check if the person is wearing glasses and if they are talking on a cell phone.
text: visual question answering
QUERY: text: visual question answering CODE BLOCKS: --- file_path=mnm/tool_api.py lines=[348, 375] type=FUNCTION content={ def visual_question_answering( image, question, ckpt=MODEL_SELECTION["visual_question_answering"] ): # alternative: "dandelin/vilt-b32-finetuned-vqa" """ It takes an image and a question about the image, and generates an answer to the question. """ import torch from transformers import BlipForQuestionAnswering, BlipProcessor image = image_processing(image) question = text_processing(question) processor = BlipProcessor.from_pretrained(ckpt) model = BlipForQuestionAnswering.from_pretrained( ckpt, torch_dtype=torch.float16 ).to(DEVICE) raw_image = image inputs = ( processor(raw_image, question, return_tensors="pt").to(torch.float16).to(DEVICE) ) out = model.generate(**inputs) result_formatted = {"text": processor.decode(out[0], skip_special_tokens=True)} return result_formatted } --- file_path=mnm/tool_api.py lines=[178, 193] type=FUNCTION content={ def question_answering( question: str, text: str, ckpt=MODEL_SELECTION["question_answering"] ): # alternative: "deepset/roberta-base-squad2" """ It takes a text and a question, and outputs an answer to that question based on the text. """ question = text_processing(question) text = text_processing(text) pipe = pipeline("question-answering", model=ckpt) result_raw = pipe( question=question, context=text ) # {'score': 0.01082150824368, 'start': 0, 'end': 10, 'answer': 'My name is'} result_formatted = {"text": result_raw["answer"]} return result_formatted } --- file_path=mnm/tool_api.py lines=[879, 893] type=FUNCTION content={ def get_math_fact(number: str, ckpt=None): """ It provides interesting math facts about numbers. """ url = f"https://numbersapi.p.rapidapi.com/{number}/math" params = {"fragment": "true", "json": "true"} headers = { "X-RapidAPI-Key": RAPID_API_KEY, "X-RapidAPI-Host": "numbersapi.p.rapidapi.com", } response = requests.get(url, headers=headers, params=params) result_formatted = response.json() return result_formatted } --- (38 additional code blocks not shown. Search again with the same query to see additional results.) Prototypes for the next 7 out of 33 classes/functions found in unexpanded results (search again with the same query to see details): def get_trivia_fact(number: str, ckpt=None): ... # mnm/tool_api.py def automatic_speech_recognition(audio: str, ckpt=MODEL_SELECTION['automatic_speech_recognition']): ... # mnm/tool_api.py def get_date_fact(date: str, ckpt=None): ... # mnm/tool_api.py def get_year_fact(year: str, ckpt=None): ... # mnm/tool_api.py def get_weather(lon: str, lat: str, ckpt=None): ... # mnm/tool_api.py def love_calculator(first_name: str, second_name: str, ckpt=None): ... # mnm/tool_api.py def wikipedia_simple_search(text: str, ckpt=None): ... # mnm/tool_api.py
Interaction 6
The visual_question_answering function from tool_api.py is suitable for checking whether each person is wearing glasses and whether they are talking on a cell phone. I will write code to perform visual question answering on each cropped face image and store the answers.
from mnm.tool_api import visual_question_answering
# Function to check attributes using visual question answering
def check_attributes(face_images):
attributes_results = []
for i, face_image in enumerate(face_images):
# Check if the person is wearing glasses
wearing_glasses = visual_question_answering(face_image, "Is this person wearing glasses?")['text']
# Check if the person is talking on a cell phone
talking_on_phone = visual_question_answering(face_image, "Is this person talking on a cell phone?")['text']
attributes_results.append({
"wearing_glasses": wearing_glasses,
"talking_on_phone": talking_on_phone,
"face_image_path": f'bbt_person_{i}.png'
})
return attributes_results
# Check attributes for each cropped face image
attributes_results = check_attributes(face_images)
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"visual_question_answering": <function visual_question_answering at 0x7f1cbd89e8c0>, "check_attributes": <function check_attributes at 0x7f1c77509bd0>, "attributes_results": [{'wearing_glasses': 'yes', 'talking_on_phone': 'yes', 'face_image_path': 'bbt_person_0.png'}, {'wearing_glasses': 'no', 'talking_on_phone': 'no', 'face_image_path': 'bbt_person_1.png'}, {'wearing_glasses': 'no', 'talking_on_phone': 'no', 'face_image_path': 'bbt_person_2.png'}, {'wearing_glasses': 'yes', 'talking_on_phone': 'no', 'face_image_path': 'bbt_person_3.png'}, {'wearing_glasses': 'yes', 'talking_on_phone': 'no', 'face_image_path': 'bbt_person_4.png'}, {'wearing_glasses': 'yes', 'talking...}
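Since visual_question_answering returns free-form text rather than booleans, a small helper (a sketch, not part of the run above) could normalize the answers before the comparison used later:
# Sketch: normalize a free-form VQA answer such as 'yes', 'Yes.', or 'no' to a boolean
def answer_is_yes(answer: str) -> bool:
    return answer.strip().lower().startswith('yes')

glasses_flags = [answer_is_yes(r['wearing_glasses']) for r in attributes_results]
phone_flags = [answer_is_yes(r['talking_on_phone']) for r in attributes_results]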
Interaction 7
The attributes for each person have been successfully checked. Now, I will proceed with creating an HTML page containing a table with one row per person and columns for the attributes and face images. I will then save the result as 'bbt.html' in the specified directory.
# Function to create an HTML page with the results
def create_html_page(attributes_results):
html_content = """
<html>
<head>
<style>
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 5px;
text-align: left;
}
</style>
</head>
<body>
<table>
<tr>
<th>Face Image</th>
<th>Wearing Glasses</th>
<th>Talking on Phone</th>
</tr>
"""
for result in attributes_results:
html_content += f"""
<tr>
<td><img src="{result['face_image_path']}" alt="Face image"></td>
<td>{result['wearing_glasses']}</td>
<td>{result['talking_on_phone']}</td>
</tr>
"""
html_content += """
</table>
</body>
</html>
"""
return html_content
# Create the HTML page and save it
html_page_content = create_html_page(attributes_results)
html_file_path = '/root/results/playground/bbt.html'
with open(html_file_path, 'w') as html_file:
html_file.write(html_page_content)
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"create_html_page": <function create_html_page at 0x7f1c6fd325f0>, "html_page_content": <html> <head> <style> table, th, td { border: 1px solid black; border-collapse: collapse; } th, td { padding: 5px; text-align: left; } </style> </head> <body> <table> <tr> <th>Face Image</th> <th>Wearing Glasses</th> <th>Talking on Phone</th> </tr> <tr> <td><img src="bbt_person_0.png" alt="Face image"></td> <td>yes</td> <td>yes</td> </tr> ..., "html_file_path": /root/results/playground/bbt.html, "html_file": <_io.TextIOWrapper name='/root/results/playground/bbt.html' mode='w' encoding='UTF-8'>}
Interaction 8
The HTML page has been created and saved successfully. Now, I need to select the person segment where all attributes are present (wearing glasses and talking on a cell phone) and then create a color pop to highlight that person. I will write code to select the segment and then search for the color pop function to apply the effect.
# Select the person segment where all attributes are present
selected_person_segment = None
for i, result in enumerate(attributes_results):
if result['wearing_glasses'].lower() == 'yes' and result['talking_on_phone'].lower() == 'yes':
selected_person_segment = people_segments_sorted[i]
break
# Check if we found a person with all attributes
if selected_person_segment is not None:
# Proceed to apply color pop effect
pass
else:
raise ValueError("No person with all attributes found.")
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"selected_person_segment": {'mask': array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), 'label': 'person', 'bbox': [384, 100, 571, 486], 'inst_id': 7}, "i": 0, "result": {'wearing_glasses': 'yes', 'talking_on_phone': 'yes', 'face_image_path': 'bbt_person_0.png'}}
Interaction 9
We have successfully identified the person segment with all attributes. Now, I need to search for the color pop function to apply the effect to the selected person segment.
text: color pop
QUERY: text: color pop CODE BLOCKS: --- file_path=mnm/tool_api.py lines=[678, 719] type=FUNCTION content={ def color_pop(image, object, ckpt=None): """ It takes an image and one or multiple objects, and returns an image where only the object is colored and the rest is black and white. """ import cv2 import numpy as np from PIL import Image def refine_mask(img, mask): bgdModel = np.zeros((1, 65), np.float64) fgdModel = np.zeros((1, 65), np.float64) mask, _, _ = cv2.grabCut( img.astype(np.uint8), mask.astype(np.uint8), None, bgdModel, fgdModel, 5, cv2.GC_INIT_WITH_MASK, ) return mask.astype(float) obj = object img = image_processing(image) gimg = img.copy() gimg = gimg.convert("L").convert("RGB") gimg = np.array(gimg).astype(float) img = np.array(img).astype(float) # for obj in objs: if "mask" in obj: # make a color pop if an object mask is provided, else return a black and white image refined_mask = refine_mask(img, obj["mask"]) mask = np.tile(refined_mask[:, :, np.newaxis], (1, 1, 3)) gimg = mask * img + (1 - mask) * gimg gimg = np.array(gimg).astype(np.uint8) gimg = Image.fromarray(gimg) result_formatted = {"image": gimg} return result_formatted } --- file_path=mnm/tool_api.py lines=[1013, 1038] type=CONDITIONAL content={ if __name__ == "__main__": # res = wikipedia_simple_search({"text": "what is a recession"}) res = text_summarization( { "text": "John Taylor, who had supported her through college, was interested in cotton." } ) # get_weather(0, {"lon": -122.330062, "lat": 47.6038321}) # get_location(0, {"city": "Seattle"}) # for movie_title, movie_year in [("Argo", 2012), ("12 Years a Slave", 2013), ("Birdman or (The Unexpected Virtue of Ignorance)", 2014), ("Spotlight", 2015), ("Moonlight",2016), ("The Shape of Water", 2017), ("Green Book", 2018), ("Parasite", 2019), ("Nomadland", 2020), ("CODA", 2021)]: # res = search_movie({"movie_title": movie_title, "movie_year": movie_year}) # get_location(0, {"city": "Seattle"}) # love_calculator(0, {'first_name': 'Alice', 'second_name': 'Bob'}) # get_math_fact(0, {"number": "22"}) # res = get_year_fact({"year": "2024"}) # get_date_fact(0, {"date": "1/11"}) # background blur # [{'id': 0, 'name': 'object detection', 'args': {'image': '2387947.jpg'}}, {'id': 1, 'name': 'select object', 'args': {'objects': '<node-0>.objects', 'object_name': 'the small white bird'}}, {'id': 2, 'name': 'background blur', 'args': {'image': '2387947.jpg', 'object': '<node-1>.object'}}] # emoji # [{'id': 0, 'name': 'object detection', 'args': {'image': '2382602.jpg'}}, {'id': 1, 'name': 'select object', 'args': {'objects': '<node-0>.objects', 'object_name': 'small red plane'}}, {'id': 2, 'name': 'emoji', 'args': {'image': '2382602.jpg', 'object': '<node-1>.object', 'emoji': 'alien'}}] # [{'id': 0, 'name': 'object detection', 'args': {'image': '2322211.jpg'}}, {'id': 1, 'name': 'select object', 'args': {'objects': '<node-0>.objects', 'object_name': 'umbrella'}}, {'id': 2, 'name': 'emoji', 'args': {'image': '2322211.jpg', 'object': '<node-1>.object', 'emoji': 'goblin'}}] # [{'id': 0, 'name': 'image segmentation', 'args': {'image': '2350119.jpg'}}, {'id': 1, 'name': 'select object', 'args': {'objects': '<node-0>.objects', 'object_name': 'black horse'}}, {'id': 2, 'name': 'emoji', 'args': {'image': '2350119.jpg', 'object': '<node-1>.object', 'emoji': 'unicorn emoji'}}] print(res, "\n") pass } --- (All code blocks matching the query were returned.)
Interaction 10
The color_pop function from tool_api.py is what we need to apply the color pop effect to the selected person segment. I will write code to apply this effect to the image and save the result as 'bbt_color_pop.png' in the specified directory.
from mnm.tool_api import color_pop
# Apply color pop effect to the selected person segment
color_pop_result = color_pop(image, selected_person_segment)
# Save the result image
color_pop_image_path = '/root/results/playground/bbt_color_pop.png'
color_pop_result['image'].save(color_pop_image_path)
RESPONSE (Execution Result): CODE EXECUTED WITHOUT ERROR, STDOUT WAS EMPTY RELEVANT VARIABLES (only shown if string rep. has changed after code exec): {"color_pop": <function color_pop at 0x7f1cbd89ee60>, "color_pop_result": {'image': <PIL.Image.Image image mode=RGB size=1034x798 at 0x7F1915C137F0>}, "color_pop_image_path": /root/results/playground/bbt_color_pop.png}
Interaction 11: Done
The color pop effect has been applied and the result image has been saved successfully. The task is now complete.
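As a final check (not part of the original run), one could confirm that every artifact requested in the task was written to disk; a minimal sketch assuming the paths from the task description:
import os
# Sketch: verify that the HTML page, the color-pop image, and all face crops exist
expected = ['/root/results/playground/bbt.html', '/root/results/playground/bbt_color_pop.png']
expected += [f'/root/results/playground/bbt_person_{i}.png' for i in range(len(face_images))]
missing = [p for p in expected if not os.path.exists(p)]
print('All expected outputs are present.' if not missing else f'Missing outputs: {missing}')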