
Commit 9931bb1

committed: Updated
1 parent 1f606d0 commit 9931bb1


4 files changed: +139 -1 lines changed


README.md

Lines changed: 59 additions & 1 deletion
# ComfyUI-AutoLabel

ComfyUI-AutoLabel is a custom node for [ComfyUI](https://github.com/comfyanonymous/ComfyUI) that uses BLIP (Bootstrapping Language-Image Pre-training) to generate detailed descriptions of the main object in an image, providing accurate, context-aware captions.

![ComfyUI-AutoLabel](demo.png)

## Features
- **Image to Text Description**: Generate detailed descriptions of the main object in an image.
- **Customizable Prompts**: Provide your own prompt to guide the description generation.
- **Flexible Inference Modes**: Supports GPU, GPU with float16, and CPU inference modes.
- **Offline Mode**: Option to run from locally cached models instead of downloading them at runtime.
## Installation

1. **Clone the Repository**: Clone this repository into your `custom_nodes` folder in ComfyUI.

   ```bash
   git clone https://github.com/fexploit/ComfyUI-AutoLabel custom_nodes/ComfyUI-AutoLabel
   ```

2. **Install Dependencies**: Navigate to the cloned folder and install the required dependencies.

   ```bash
   cd custom_nodes/ComfyUI-AutoLabel
   pip install -r requirements.txt
   ```
## Usage

### Adding the Node

1. Start ComfyUI.
2. Add the `AutoLabel` node from the custom nodes list.
3. Connect an image input and configure the parameters as needed.
### Parameters

- `image` (required): The input image tensor.
- `prompt` (optional): A string to guide the description generation (default: "a photography of").
- `repo_id` (optional): The Hugging Face model repository ID (default: "Salesforce/blip-image-captioning-base").
- `inference_mode` (optional): The inference mode, one of "gpu_float16", "gpu", or "cpu" (the node's dropdown defaults to the first option, "gpu_float16").
- `get_model_online` (optional): Boolean flag to download the model from the Hugging Face Hub if it is not already cached locally (default: True). When set to False, the model must already be in the local cache; see the sketch below.
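When `get_model_online` is False, `autolabel.py` sets `TRANSFORMERS_OFFLINE=1`, so the BLIP weights have to be in the local Hugging Face cache already. A minimal sketch for pre-populating the cache while online (not part of the original README):

```python
# Run this once while online so the BLIP processor and weights are cached locally.
# Afterwards the node can run with get_model_online = False (TRANSFORMERS_OFFLINE=1).
from transformers import BlipProcessor, BlipForConditionalGeneration

repo_id = "Salesforce/blip-image-captioning-base"
BlipProcessor.from_pretrained(repo_id)
BlipForConditionalGeneration.from_pretrained(repo_id)
```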
## Contributing

Contributions are welcome! Please open an issue or submit a pull request with your changes.

## License

This project is licensed under the MIT License.

## Acknowledgements

- [ComfyUI](https://github.com/comfyanonymous/ComfyUI)
- [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-base)

## Contact

For any inquiries, please open an issue on the [GitHub repository](https://github.com/fexploit/ComfyUI-AutoLabel).

__init__.py

Lines changed: 9 additions & 0 deletions
from .autolabel import AutoLabel

# Register the node class and its display name so ComfyUI can discover the node.
NODE_CLASS_MAPPINGS = {
    "AutoLabel": AutoLabel
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "AutoLabel": "Auto Label"
}

autolabel.py

Lines changed: 71 additions & 0 deletions
import os

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


class AutoLabel:
    def __init__(self):
        pass

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "prompt": ("STRING", {"default": "a photography of"}),
                "repo_id": ("STRING", {"default": "Salesforce/blip-image-captioning-base"}),
                "inference_mode": (["gpu_float16", "gpu", "cpu"],),
                "get_model_online": ("BOOLEAN", {"default": True}),
            }
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("main_object_description",)
    FUNCTION = "generate_caption"
    CATEGORY = "AutoLabel"

    def tensor_to_image(self, tensor):
        # Convert a ComfyUI image tensor (float values in [0, 1]) to a PIL RGB image.
        tensor = tensor.cpu()
        image_np = tensor.squeeze().mul(255).clamp(0, 255).byte().numpy()
        image = Image.fromarray(image_np, mode="RGB")
        return image

    def generate_caption(self, image, prompt, repo_id, inference_mode, get_model_online):
        if image is None:
            raise ValueError("Need an image")
        if not repo_id:
            raise ValueError("Need a repo_id or local_model_path")

        # When offline mode is requested, force transformers to use only locally cached files.
        if not get_model_online:
            os.environ["TRANSFORMERS_OFFLINE"] = "1"

        processor = BlipProcessor.from_pretrained(repo_id)
        pil_image = self.tensor_to_image(image)

        try:
            # Load the BLIP captioning model on the requested device and precision.
            if inference_mode == "gpu_float16":
                model = BlipForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.float16).to("cuda")
                inputs = processor(pil_image, prompt, return_tensors="pt").to("cuda", torch.float16)
            elif inference_mode == "gpu":
                model = BlipForConditionalGeneration.from_pretrained(repo_id).to("cuda")
                inputs = processor(pil_image, prompt, return_tensors="pt").to("cuda")
            else:
                model = BlipForConditionalGeneration.from_pretrained(repo_id)
                inputs = processor(pil_image, prompt, return_tensors="pt")

            # Generate the caption conditioned on the prompt and decode it to plain text.
            out = model.generate(**inputs)
            description = processor.decode(out[0], skip_special_tokens=True)
            return (description,)

        except Exception as e:
            print(e)
            return ("Error occurred during caption generation",)


NODE_CLASS_MAPPINGS = {
    "AutoLabel": AutoLabel
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "AutoLabel": "Auto Label"
}
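The node can also be exercised outside ComfyUI as a quick smoke test. The sketch below is illustrative and not part of this commit: it assumes ComfyUI's `[batch, height, width, channel]` float image layout with values in `[0, 1]`, and that it is run from the ComfyUI-AutoLabel folder so `autolabel` is importable. The first run downloads the BLIP weights.

```python
# Minimal standalone smoke test (hypothetical, not part of this commit).
import torch
from autolabel import AutoLabel  # assumes the current directory is ComfyUI-AutoLabel

node = AutoLabel()
dummy = torch.rand(1, 384, 384, 3)  # stand-in for a ComfyUI image tensor in [0, 1]

(caption,) = node.generate_caption(
    image=dummy,
    prompt="a photography of",
    repo_id="Salesforce/blip-image-captioning-base",
    inference_mode="cpu",        # no GPU needed for the smoke test
    get_model_online=True,       # allow the weights to be downloaded on first run
)
print(caption)
```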

demo.png

282 KB

0 commit comments
