Advanced Image Preprocessing & 15-Step Augmentation Pipeline for Agriculture Crop Disease Datasets
Working with agricultural images—especially fruit and leaf disease datasets—comes with its own set of challenges.
Lighting variations, inconsistent contrast, noise, and different camera positions can easily confuse a machine-learning model.
To handle this, here is a complete preprocessing and augmentation workflow designed for agriculture datasets such as apples, mangoes, grapes, wheat leaves, rice leaves, and more.
This pipeline generates 15 automatic transformations per image, making your dataset much richer and far more suitable for training deep-learning models like CNNs, Vision Transformers, and hybrid SVM-ViT models.
1. Install the Required Python Packages
We install only the necessary tools such as OpenCV, Pillow, and tqdm. These handle image processing, resizing, and progress visualization.
# 📦 Install Required Packages
!pip install opencv-python-headless Pillow tqdm
2. Importing Libraries
Next, we import OpenCV, NumPy, PIL, and Google Drive utilities.
These are essential for reading images, preprocessing them, and saving the outputs.
# 📚 Import Libraries
import os
import cv2
import numpy as np
import shutil
from tqdm import tqdm
from PIL import Image, ImageOps
from google.colab import drive
3. Mount Google Drive
Most users keep datasets inside Drive, so we mount it here for easy access.
# 🔗 Mount Google Drive
# Makes Drive contents reachable under /content/drive; input_path below points into this mount.
drive.mount('/content/drive')
4. Define Input and Output Paths
You can replace the folder names based on your own dataset directory.
The script creates the output folder if it does not already exist and stores all processed images there (an existing folder is reused, not emptied).
# 📁 Define Paths
# Root of the raw dataset on the mounted Drive; the processing loop reads one sub-folder per category from it.
input_path = '/content/drive/MyDrive/Apple_Fruit_Dataset'
# Destination for the processed images (outside the Drive mount).
output_path = '/content/processed_apple_dataset'
# exist_ok=True: re-running the cell reuses the folder and leaves any earlier files in place.
os.makedirs(output_path, exist_ok=True)
We use a standard 256×256 size because it works well with CNN-based models.
# Passed to cv2.resize as dsize, i.e. (width, height); square here, so the order does not matter.
TARGET_SIZE = (256, 256)
5. The 15-Step Preprocessing & Augmentation Function
This function returns 15 images for every input: the resized original plus 14 processed versions of it.
These include grayscale, CLAHE, blurring, edge detection, thresholding, color jitter, gamma correction, rotation, flipping, and more.
This diversity helps the model learn real-world variations in crops and diseases.
# 🔧 Define 15 Preprocessing Functions
def advanced_preprocess_15(img):
    """Return 15 variants of a BGR image: a copy of the input plus 14 processed versions.

    Args:
        img: 3-channel, 8-bit BGR image as a NumPy array (for example the
            result of ``cv2.imread`` followed by ``cv2.resize``).

    Returns:
        A list of 15 BGR arrays with the same height and width as ``img``,
        in this order: original, grayscale, binary threshold, CLAHE,
        Gaussian blur, median blur, bilateral filter, adaptive mean
        threshold, adaptive Gaussian threshold, Canny edges, Sobel edges,
        gamma correction, HSV jitter, +30 degree rotation, horizontal flip.
        Single-channel results are expanded back to three channels so every
        variant can be saved and consumed the same way.
    """
    # Use the image's own size (not a module-level constant) so the rotation
    # stays centred and uncropped for any input resolution.
    height, width = img.shape[:2]

    # Grayscale version shared by the thresholding and edge-detection steps.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    def as_bgr(single_channel):
        """Expand a single-channel image to three identical BGR channels."""
        return cv2.cvtColor(single_channel, cv2.COLOR_GRAY2BGR)

    processed = []

    # 1. Original (copied so the returned list never aliases the caller's array)
    processed.append(img.copy())

    # 2. Grayscale
    processed.append(as_bgr(gray))

    # 3. Black & White Threshold (fixed global cut-off at 127)
    _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    processed.append(as_bgr(bw))

    # 4. CLAHE (Contrast Enhancement) applied to the lightness channel only,
    #    leaving the two colour channels of LAB untouched.
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l_chan, a_chan, b_chan = cv2.split(lab)
    l_equalized = cv2.createCLAHE(clipLimit=2.0).apply(l_chan)
    lab_equalized = cv2.merge((l_equalized, a_chan, b_chan))
    processed.append(cv2.cvtColor(lab_equalized, cv2.COLOR_LAB2BGR))

    # 5. Gaussian Blur
    processed.append(cv2.GaussianBlur(img, (5, 5), 0))

    # 6. Median Blur
    processed.append(cv2.medianBlur(img, 5))

    # 7. Bilateral Filter
    processed.append(cv2.bilateralFilter(img, 9, 75, 75))

    # 8. Adaptive Mean Threshold (11x11 neighbourhood, constant 2)
    th_mean = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
    processed.append(as_bgr(th_mean))

    # 9. Adaptive Gaussian Threshold (11x11 neighbourhood, constant 2)
    th_gauss = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)
    processed.append(as_bgr(th_gauss))

    # 10. Canny Edge Detection
    processed.append(as_bgr(cv2.Canny(gray, 100, 200)))

    # 11. Sobel Edge Detection
    # Gradients are computed in float64 so negative responses are kept; the
    # magnitude is then clipped (not rescaled) into the 8-bit range.
    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=5)
    sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=5)
    sobel = np.uint8(np.clip(cv2.magnitude(sobel_x, sobel_y), 0, 255))
    processed.append(as_bgr(sobel))

    # 12. Gamma Correction via a 256-entry lookup table
    gamma = 1.5
    inv_gamma = 1.0 / gamma
    table = np.array([((i / 255.0) ** inv_gamma) * 255
                      for i in range(256)]).astype("uint8")
    processed.append(cv2.LUT(img, table))

    # 13. HSV Jitter: raise saturation and value by 20 (cv2.add saturates
    #     instead of wrapping around); hue is left unchanged.
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    sat = cv2.add(sat, 20)
    val = cv2.add(val, 20)
    processed.append(cv2.cvtColor(cv2.merge([hue, sat, val]), cv2.COLOR_HSV2BGR))

    # 14. Rotation (+30°) about the image centre, output kept at the input size
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), 30, 1.0)
    processed.append(cv2.warpAffine(img, rotation, (width, height)))

    # 15. Horizontal Flip
    processed.append(cv2.flip(img, 1))

    return processed
6. Processing Agriculture Dataset Categories
Here, our dataset has two categories: healthy and rot.
You can add more classes easily—like scab, leaf spot, canker, anthracnose, etc.
# ⚙️ Process Each Image
# One sub-folder per class is expected under input_path; the same layout is
# recreated under output_path.
categories = ['healthy', 'rot']
image_count = 0    # running index across all categories, keeps output names unique
skipped_count = 0  # directory entries that cv2.imread could not decode

for category in categories:
    input_folder = os.path.join(input_path, category)
    output_folder = os.path.join(output_path, category)
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # numbering of the output files reproducible between runs.
    for img_name in tqdm(sorted(os.listdir(input_folder)), desc=f"Processing {category}"):
        img_path = os.path.join(input_folder, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for anything it cannot decode
            # (sub-folders, non-image files, corrupt images).
            skipped_count += 1
            continue

        img_resized = cv2.resize(img, TARGET_SIZE)
        processed_imgs = advanced_preprocess_15(img_resized)

        for variant_no, proc_img in enumerate(processed_imgs, start=1):
            out_filename = f"abf_{category}_{image_count:04d}_v{variant_no}.jpg"
            out_file = os.path.join(output_folder, out_filename)
            # cv2.imwrite signals failure through its return value; stop here
            # rather than zipping an incomplete dataset later.
            if not cv2.imwrite(out_file, proc_img):
                raise OSError(f"Could not write {out_file}")

        image_count += 1

print(f"Processed {image_count} images; skipped {skipped_count} unreadable entries.")
7. Exporting the Final Dataset
The complete processed dataset is compressed into a ZIP file for easy download and training.
# 📦 Zip the Folder
# Name the archive after the output folder so that changing output_path above
# also changes the zip name; make_archive appends ".zip" and returns the path.
archive_base = os.path.normpath(output_path)
archive_path = shutil.make_archive(archive_base, 'zip', output_path)
print("✅ Done: Resized, preprocessed, and zipped 15-image variants per raw input.")
print(f"Archive written to: {archive_path}")
Final Thoughts
This 15-step preprocessing pipeline is one of the most advanced setups for agriculture datasets. It enhances clarity, reduces noise, boosts contrast, extracts edges, and generates multiple variations of each image—making your training dataset stronger and more realistic.
Such transformations help deep-learning models detect crop diseases with better accuracy and generalization. Whether you're working on apple fruit rot detection, leaf disease classification, or smart farming applications, this workflow is designed to give you clean, consistent, and augmented images ready for AI models.
If you want, I can also prepare:
Comments
Post a Comment