[Misc] IO Processor plugins for pooling models (#22820)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-09-01 07:07:12 +01:00
parent 437c3ce026
commit 1cb39dbcdd
25 changed files with 1183 additions and 43 deletions
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import base64
+import os
+
+import torch
+
+from vllm import LLM
+from vllm.pooling_params import PoolingParams
+
+# This example shows how to perform an offline inference that generates
+# multimodal data. In this specific case this example will take a geotiff
+# image as input, process it using the multimodal data processor, and
+# perform inference.
+# Requirement - install plugin at:
+#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+
+
+def main():
+    """Run offline inference on a GeoTIFF and save the returned image.
+
+    The prompt references the input image by URL and asks for base64 output
+    ("b64_json"). The request goes through the "prithvi_to_tiff_india" IO
+    processor plugin; the `data` field of the result is base64-decoded and
+    written to ``offline_prediction.tiff`` in the current working directory.
+    """
+    torch.set_default_dtype(torch.float16)
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+
+    # Request description handed to the IO processor plugin.
+    img_prompt = {
+        "data": image_url,
+        "data_format": "url",
+        "image_format": "tiff",
+        "out_data_format": "b64_json",
+    }
+
+    llm = LLM(
+        model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        skip_tokenizer_init=True,
+        trust_remote_code=True,
+        enforce_eager=True,
+        # Cap the number of parallel requests so the model does not run
+        # out of memory; the usable maximum depends on the available
+        # GPU memory.
+        max_num_seqs=32,
+        io_processor_plugin="prithvi_to_tiff_india",
+    )
+
+    results = llm.encode(
+        img_prompt,
+        pooling_params=PoolingParams(task="encode", softmax=False),
+    )
+    # A single prompt was submitted, so only the first result is used.
+    output = results[0].outputs
+
+    print(output)
+    tiff_bytes = base64.b64decode(output.data)
+
+    file_path = os.path.join(os.getcwd(), "offline_prediction.tiff")
+    with open(file_path, "wb") as out_file:
+        out_file.write(tiff_bytes)
+
+    print(f"Output file path: {file_path}")
+
+
+# Run the example only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import os
+
+import requests
+
+# This example shows how to perform an online inference that generates
+# multimodal data. In this specific case this example will take a geotiff
+# image as input, process it using the multimodal data processor, and
+# perform inference.
+# Requirements:
+# - install plugin at:
+#   https://github.com/christian-pinto/prithvi_io_processor_plugin
+# - start vllm in serving mode with the below args
+#   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
+#   --task embed --trust-remote-code
+#   --skip-tokenizer-init --enforce-eager
+#   --io-processor-plugin prithvi_to_tiff_india
+
+
+def main():
+    """Request a prediction from a running vLLM server and save it.
+
+    Posts a pooling request that references a GeoTIFF by URL to the local
+    ``/pooling`` endpoint, base64-decodes the ``data.data`` field of the
+    JSON response and writes it to ``online_prediction.tiff`` in the current
+    working directory.
+
+    Raises:
+        requests.HTTPError: If the server answers with a 4xx/5xx status.
+    """
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    server_endpoint = "http://localhost:8000/pooling"
+
+    request_payload_url = {
+        "data": {
+            "data": image_url,
+            "data_format": "url",
+            "image_format": "tiff",
+            "out_data_format": "b64_json",
+        },
+        "priority": 0,
+        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+    }
+
+    ret = requests.post(server_endpoint, json=request_payload_url)
+
+    print(f"response.status_code: {ret.status_code}")
+    print(f"response.reason:{ret.reason}")
+
+    # Stop on an HTTP error here; otherwise an error body would only surface
+    # as an opaque KeyError when the payload is indexed below.
+    ret.raise_for_status()
+
+    response = ret.json()
+
+    decoded_image = base64.b64decode(response["data"]["data"])
+
+    out_path = os.path.join(os.getcwd(), "online_prediction.tiff")
+
+    with open(out_path, "wb") as f:
+        f.write(decoded_image)
+
+    # Report where the image was written, as the offline example does.
+    print(f"Output file path: {out_path}")
+
+
+# Run the example only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()