#ifndef LIB_IMAGE_C
#define LIB_IMAGE_C 1
#pragma once
/*
*/

// types

	// A 2D pixel buffer together with cached transparency hints.
	// The routines in this file address .data as 4 bytes per pixel, row after row.
	typedef struct {
		Vec2i size; // dimensions in pixels (read as .w/.h and also as .x/.y in this file)
		void* data; // pixel storage, allocated by image_new / grown by image_resize_data
		bool HAS_TRANSPARENCY : 1; // this is helpful for drawing, as disabling transparency allows for much greater rendering speed
		bool HAS_SEMITRANSPARENCY : 1; // this is even more important, if the image only has 0% and 100% transparency (for example pixel art), pixel rendering can be sped up significantly (>50x) by omitting transparency multiplication
		int datasize; // allocated size of .data in bytes; image_new rounds it up, so it may exceed w*h*4
	} Image;
	
	// One 8-bit-per-channel pixel. The same 4 bytes can be read either as a single
	// u32 (.data) or as the individual channels; .r is the first member of the struct
	// and .a the last. Packing is forced to 1 so no padding is inserted.
	#pragma pack(push, 1)
	typedef union {
		u32 data; // use u32 integers whenever possible, for some reason it's much faster than using the Rgba8 struct
		struct { // these are basically just here to make it easy to set up the .data variable correctly
			u8 r;
			u8 g;
			u8 b;
			u8 a;
		};
	} Rgba8;
	#pragma pack(pop)
	
	// RGBA pixel values
	
		// Compound-literal constructors for pixels stored in R,G,B,A member order.
		// rgba8(r, g, b, a) builds an arbitrary color; the constants below are fully
		// opaque (a=255) except RGBA8_EMPTY / RGBA8_TRANSPARENT, which are all zero.
		#define rgba8(rr, gg, bb, aa) ((Rgba8){ .r=rr, .g=gg, .b=bb, .a=aa })
		#define RGBA8_EMPTY   ((Rgba8){ .r=  0, .g=  0, .b=  0, .a=  0 })
		#define RGBA8_TRANSPARENT RGBA8_EMPTY
		#define RGBA8_BLACK   ((Rgba8){ .r=  0, .g=  0, .b=  0, .a=255 })
		#define RGBA8_WHITE   ((Rgba8){ .r=255, .g=255, .b=255, .a=255 })
		
		// Gray ramp: GRAYnn uses roughly nn% of 255 on all three color channels.
		// Note that there is no GRAY85 entry.
		#define RGBA8_GRAY5   ((Rgba8){ .r= 13, .g= 13, .b= 13, .a=255 })
		#define RGBA8_GRAY10  ((Rgba8){ .r= 26, .g= 26, .b= 26, .a=255 })
		#define RGBA8_GRAY15  ((Rgba8){ .r= 38, .g= 38, .b= 38, .a=255 })
		#define RGBA8_GRAY20  ((Rgba8){ .r= 51, .g= 51, .b= 51, .a=255 })
		#define RGBA8_GRAY25  ((Rgba8){ .r= 64, .g= 64, .b= 64, .a=255 })
		#define RGBA8_GRAY30  ((Rgba8){ .r= 77, .g= 77, .b= 77, .a=255 })
		#define RGBA8_GRAY35  ((Rgba8){ .r= 89, .g= 89, .b= 89, .a=255 })
		#define RGBA8_GRAY40  ((Rgba8){ .r=102, .g=102, .b=102, .a=255 })
		#define RGBA8_GRAY45  ((Rgba8){ .r=115, .g=115, .b=115, .a=255 })
		#define RGBA8_GRAY50  ((Rgba8){ .r=127, .g=127, .b=127, .a=255 })
		#define RGBA8_GRAY55  ((Rgba8){ .r=140, .g=140, .b=140, .a=255 })
		#define RGBA8_GRAY60  ((Rgba8){ .r=153, .g=153, .b=153, .a=255 })
		#define RGBA8_GRAY65  ((Rgba8){ .r=166, .g=166, .b=166, .a=255 })
		#define RGBA8_GRAY70  ((Rgba8){ .r=179, .g=179, .b=179, .a=255 })
		#define RGBA8_GRAY75  ((Rgba8){ .r=191, .g=191, .b=191, .a=255 })
		#define RGBA8_GRAY80  ((Rgba8){ .r=204, .g=204, .b=204, .a=255 })
		#define RGBA8_GRAY90  ((Rgba8){ .r=230, .g=230, .b=230, .a=255 })
		#define RGBA8_GRAY95  ((Rgba8){ .r=242, .g=242, .b=242, .a=255 })
		
		// Primary and secondary colors at full intensity.
		#define RGBA8_RED     ((Rgba8){ .r=255, .g=  0, .b=  0, .a=255 })
		#define RGBA8_GREEN   ((Rgba8){ .r=  0, .g=255, .b=  0, .a=255 })
		#define RGBA8_BLUE    ((Rgba8){ .r=  0, .g=  0, .b=255, .a=255 })
		#define RGBA8_YELLOW  ((Rgba8){ .r=255, .g=255, .b=  0, .a=255 })
		#define RGBA8_CYAN    ((Rgba8){ .r=  0, .g=255, .b=255, .a=255 })
		#define RGBA8_MAGENTA ((Rgba8){ .r=255, .g=  0, .b=255, .a=255 })
	
	// BGRA pixel values
	
		// Same palette as the RGBA8_* macros, but for buffers whose bytes are laid out
		// B,G,R,A. The Rgba8 type is reused, so the red value is stored in the .b member
		// and the blue value in the .r member. bgra8() still takes its arguments in
		// red, green, blue, alpha order and performs that swap itself.
		#define bgra8(rr, gg, bb, aa) ((Rgba8){ .r=bb, .g=gg, .b=rr, .a=aa })
		#define BGRA8_EMPTY   ((Rgba8){ .r=  0, .g=  0, .b=  0, .a=  0 })
		#define BGRA8_TRANSPARENT BGRA8_EMPTY
		#define BGRA8_BLACK   ((Rgba8){ .r=  0, .g=  0, .b=  0, .a=255 })
		#define BGRA8_WHITE   ((Rgba8){ .r=255, .g=255, .b=255, .a=255 })
		
		// Gray ramp: identical to the RGBA8 values because all color channels are equal.
		#define BGRA8_GRAY5   ((Rgba8){ .r= 13, .g= 13, .b= 13, .a=255 })
		#define BGRA8_GRAY10  ((Rgba8){ .r= 26, .g= 26, .b= 26, .a=255 })
		#define BGRA8_GRAY15  ((Rgba8){ .r= 38, .g= 38, .b= 38, .a=255 })
		#define BGRA8_GRAY20  ((Rgba8){ .r= 51, .g= 51, .b= 51, .a=255 })
		#define BGRA8_GRAY25  ((Rgba8){ .r= 64, .g= 64, .b= 64, .a=255 })
		#define BGRA8_GRAY30  ((Rgba8){ .r= 77, .g= 77, .b= 77, .a=255 })
		#define BGRA8_GRAY35  ((Rgba8){ .r= 89, .g= 89, .b= 89, .a=255 })
		#define BGRA8_GRAY40  ((Rgba8){ .r=102, .g=102, .b=102, .a=255 })
		#define BGRA8_GRAY45  ((Rgba8){ .r=115, .g=115, .b=115, .a=255 })
		#define BGRA8_GRAY50  ((Rgba8){ .r=127, .g=127, .b=127, .a=255 })
		#define BGRA8_GRAY55  ((Rgba8){ .r=140, .g=140, .b=140, .a=255 })
		#define BGRA8_GRAY60  ((Rgba8){ .r=153, .g=153, .b=153, .a=255 })
		#define BGRA8_GRAY65  ((Rgba8){ .r=166, .g=166, .b=166, .a=255 })
		#define BGRA8_GRAY70  ((Rgba8){ .r=179, .g=179, .b=179, .a=255 })
		#define BGRA8_GRAY75  ((Rgba8){ .r=191, .g=191, .b=191, .a=255 })
		#define BGRA8_GRAY80  ((Rgba8){ .r=204, .g=204, .b=204, .a=255 })
		#define BGRA8_GRAY90  ((Rgba8){ .r=230, .g=230, .b=230, .a=255 })
		#define BGRA8_GRAY95  ((Rgba8){ .r=242, .g=242, .b=242, .a=255 })
		
		// Primary and secondary colors; compared with RGBA8_*, the .r and .b members are exchanged.
		#define BGRA8_RED     ((Rgba8){ .r=  0, .g=  0, .b=255, .a=255 })
		#define BGRA8_GREEN   ((Rgba8){ .r=  0, .g=255, .b=  0, .a=255 })
		#define BGRA8_BLUE    ((Rgba8){ .r=255, .g=  0, .b=  0, .a=255 })
		#define BGRA8_YELLOW  ((Rgba8){ .r=  0, .g=255, .b=255, .a=255 })
		#define BGRA8_CYAN    ((Rgba8){ .r=255, .g=255, .b=  0, .a=255 })
		#define BGRA8_MAGENTA ((Rgba8){ .r=255, .g=  0, .b=255, .a=255 })

// stb

	#define STBI_NO_STDIO
	#define STB_IMAGE_IMPLEMENTATION
	// #define STBI_MALLOC mem_alloc
	// #define STBI_REALLOC mem_realloc
	// #define STBI_FREE mem_free
	#include "stb_image.h"

// create

	// load
	// Creates an image of the given size with a freshly allocated pixel buffer.
	// The buffer size is w*h*4 bytes rounded up by nearest_power_of_two_not_lower_u32;
	// the pixel contents are whatever mem_alloc returns. Both transparency flags start cleared.
	static Image image_new (Vec2i size) {
		Image result = {0};
		
		result.size = size;
		result.datasize = (i32)nearest_power_of_two_not_lower_u32(size.w * size.h * 4);
		result.data = mem_alloc(result.datasize);
		
		return result;
	}
	// Releases the pixel buffer (if there is one) and resets every field of the image to zero.
	static void image_free (Image* image) {
		void* pixels = image->data;
		if (pixels) mem_free(pixels);
		
		*image = (Image){0};
	}
	// Copies `datasize` bytes of RGBA8 pixel data into the image buffer and recomputes
	// HAS_TRANSPARENCY / HAS_SEMITRANSPARENCY from the alpha byte of every copied pixel.
	// Returns 0 on success, or 1 if `data` is NULL, `datasize` is negative, or the
	// image buffer is smaller than `datasize` (nothing is modified in that case).
	static int image_set_rgba8_pixel_data (Image* image, int datasize, u8* data) {
		if (!data || datasize < 0) return 1;
		if (image->datasize < datasize) return 1;
		mem_copy_to(image->data, datasize, data);
		
		// start from a clean slate, otherwise flags belonging to previous pixel data would stick
		image->HAS_TRANSPARENCY = 0;
		image->HAS_SEMITRANSPARENCY = 0;
		
		// only inspect pixels that were actually copied
		int pixelcount = 0;
		if (image->size.w > 0 && image->size.h > 0) {
			pixelcount = image->size.w * image->size.h;
			if (pixelcount > datasize / 4) pixelcount = datasize / 4;
		}
		
		// check transparency; px[3] is the alpha byte of the current pixel
		const u8* px = image->data;
		for (int i=0; i<pixelcount; i++, px += sizeof(u32)) {
			if (px[3] == 255) continue;
			
			image->HAS_TRANSPARENCY = TRUE;
			if (px[3] > 0) {
				// both flags are set now, nothing more to learn
				image->HAS_SEMITRANSPARENCY = TRUE;
				break;
			}
		}
		return 0;
	}
	// Decodes an encoded image held in memory (via stb_image, forced to 4 channels)
	// into a newly created Image that is written to *image.
	// Returns 1 if `data` is NULL, 2 if `datalen` is 8 or less, 3 if stb_image fails
	// to decode, otherwise the result of image_set_rgba8_pixel_data.
	static inline int image_load_from_data (Image* image, int datalen, void* data) {
		if (!data) return 1;
		if (datalen <= 8) return 2;
		
		int width = 0;
		int height = 0;
		int source_channels = 0;
		u8* decoded = (u8*)stbi_load_from_memory(data, datalen, &width, &height, &source_channels, 4);
		if (!decoded) return 3;
		
		*image = image_new(vec2i(width, height));
		int status = image_set_rgba8_pixel_data(image, width*height*4, decoded);
		
		// the pixels have been copied into the image, stb's buffer is no longer needed
		stbi_image_free(decoded);
		return status;
	}
	// Loads a whole file with os_load_entire_file and decodes it with image_load_from_data.
	// Returns the file loader's error code if loading fails, otherwise the decoder's result.
	static int image_load_from_path (Image* image, char* filepath) {
		Os_file file = {0};
		int status = os_load_entire_file(filepath, &file);
		
		if (status == 0) {
			status = image_load_from_data(image, file.size, file.data);
			os_free_file_data(&file);
		}
		
		return status;
	}
	
	// channel swap
	// Exchanges two channel bytes in every pixel of the image.
	// `a` and `b` are byte offsets inside a 4-byte pixel; they are not range-checked.
	static inline void image_swap_channels (Image* image, int a, int b) {
		int bytes = image->size.y * image->size.w * sizeof(u32);
		u8* base = image->data;
		
		// walk the buffer one 4-byte pixel at a time
		for (int offset=0; offset<bytes; offset+=4) {
			u8* px = base + offset;
			u8 keep = px[a];
			px[a] = px[b];
			px[b] = keep;
		}
	}
	// Converts the pixel data in place from R,G,B,A to B,G,R,A byte order
	// by exchanging bytes 0 and 2 of every pixel.
	static inline void image_rgba8_to_bgra8 (Image* image) {
		image_swap_channels(image, 0, 2);
	}
	// Converts the pixel data in place from B,G,R,A to R,G,B,A byte order.
	// This is the same byte 0 <-> byte 2 exchange as the opposite conversion.
	static inline void image_bgra8_to_rgba8 (Image* image) {
		image_swap_channels(image, 0, 2);
	}
	
	// Mirrors the image top-to-bottom in place: row 0 trades places with the last row,
	// row 1 with the second to last, and so on. For an odd height the middle row stays put.
	// Works directly on the pixel buffer, so no temporary allocation is needed.
	static void image_flip_vertically (Image* image) {
		int w = image->size.w;
		int h = image->size.h;
		u32* pixels = image->data;
		
		for (int top=0, bottom=h-1; top<bottom; top++, bottom--) {
			u32* upper = pixels + top*w;
			u32* lower = pixels + bottom*w;
			for (int x=0; x<w; x++) {
				u32 keep = upper[x];
				upper[x] = lower[x];
				lower[x] = keep;
			}
		}
	}
	// Overrides the cached transparency flags without looking at the pixels.
	// Requesting semitransparency also switches on plain transparency.
	static void image_toggle_transparency (Image* image, bool transparency, bool semitransparency) {
		bool any_transparency = transparency || semitransparency;
		
		image->HAS_SEMITRANSPARENCY = semitransparency;
		image->HAS_TRANSPARENCY = any_transparency;
	}
	
	// resize
	// Changes the recorded size of the image and makes sure the buffer is large enough for it.
	// note: the pixels are not rearranged in any way, and the buffer only ever grows.
	static inline void image_resize_data (Image* this, Vec2i size) {
		int needed = (i32)nearest_power_of_two_not_lower_u32(size.w * size.h * 4);
		
		this->size = size;
		
		if (needed > this->datasize) {
			this->data = mem_realloc(this->data, needed);
			this->datasize = needed;
		}
	}

// pixels

	// Returns a pointer to the pixel at (x, y). No bounds checking is done,
	// the caller has to guarantee that the coordinates are inside the image.
	static inline Rgba8* image_get_rgba8_pixel_dangerously (Image* this, int x, int y) {
		Rgba8* first = (Rgba8*)this->data;
		return first + (y*this->size.w + x);
	}
	// Returns a pointer to the pixel at (x, y), or NULL if the coordinates are outside the image.
	static inline Rgba8* image_get_rgba8_pixel (Image* this, int x, int y) {
		// any single violated bound puts the pixel outside the image
		if ( x<0 || y<0 || x>=this->size.w || y>=this->size.h ) return NULL;
		
		return this->data + (y*this->size.w + x)*sizeof(u32);
	}
	
	// Linear blend of two colors: `ratio` is the weight of x, (1 - ratio) the weight of y.
	// A fully transparent input may still carry color values that must not leak into
	// the result, so in that case the other color is returned with only its alpha scaled.
	// Results are clamped at 255; there is no clamp at the lower end.
	static inline Rgba8 rgba8_mix_colors (Rgba8 x, Rgba8 y, f32 ratio) {
		if (x.a == 0) {
			// x contributes no color: keep y's color and fade its alpha
			f32 inverse = 1.0 - ratio;
			int alpha = round((f32)y.a * inverse);
			if (alpha > 255) alpha = 255;
			y.a = alpha;
			return y;
		}
		
		if (y.a == 0) {
			// y contributes no color: keep x's color and fade its alpha
			int alpha = round((f32)x.a * ratio);
			if (alpha > 255) alpha = 255;
			x.a = alpha;
			return x;
		}
		
		// both visible: blend all four channels
		f32 inverse = 1.0 - ratio;
		int red   = round((f32)x.r * ratio + (f32)y.r * inverse);
		int green = round((f32)x.g * ratio + (f32)y.g * inverse);
		int blue  = round((f32)x.b * ratio + (f32)y.b * inverse);
		int alpha = round((f32)x.a * ratio + (f32)y.a * inverse);
		if (red   > 255) red   = 255;
		if (green > 255) green = 255;
		if (blue  > 255) blue  = 255;
		if (alpha > 255) alpha = 255;
		x.r = red;
		x.g = green;
		x.b = blue;
		x.a = alpha;
		return x;
	}
	// Draws *src on top of *dest ("source over") and stores the result in *dest.
	// Both colors carry their own alpha; the resulting alpha is the combined coverage.
	// Channel results are truncated (not rounded) when converted back to u8.
	static inline void rgba8_overlay_alpha_blend (Rgba8* dest, Rgba8* src) {
		// this is mostly for reference, it probably doesn't optimize well and certainly doesn't vectorize well. Consider to inline/optimize/vectorize this code in real functions that do fills etc.
		if (src->a == 255 || dest->a == 0) { // no need for blending if top is fully opaque or bottom is fully transparent.
			dest->data = src->data;
			return;
		}
		else if (src->a == 0) {
			// a fully transparent source leaves the destination untouched
			return;
		}
		else {
			// work in floating point, channel values still in the 0..255 range
			f32 src_rf = src->r;
			f32 src_gf = src->g;
			f32 src_bf = src->b;
			f32 src_af = src->a;
			f32 dest_rf = dest->r;
			f32 dest_gf = dest->g;
			f32 dest_bf = dest->b;
			f32 dest_af = dest->a;
			
			// alphas as fractions 0..1
			f32 src_afp = src_af / 255.0;
			f32 dest_afp = dest_af / 255.0;
			
			f32 dest_affinalp = src_afp + (1.0-src_afp)*dest_afp; // same result when you do this the other way round:  dest_afp + (1.0-dest_afp)*src_afp;
			dest_af = dest_affinalp * 255.0;
			
			// share of the final color that comes from the source; the remainder comes from the destination.
			// dest_affinalp cannot be zero here because src->a > 0 in this branch.
			f32 src_amod = src_afp/dest_affinalp;
			f32 dest_amod = 1.0 - src_amod;
			
			src_rf *= src_amod;
			src_gf *= src_amod;
			src_bf *= src_amod;
			
			dest_rf *= dest_amod;
			dest_gf *= dest_amod;
			dest_bf *= dest_amod;
			
			dest_rf += src_rf;
			dest_gf += src_gf;
			dest_bf += src_bf;
			
			// clamp at the top and truncate to 8 bits
			dest->r = (dest_rf > 255) ? 255 : (u8)dest_rf;
			dest->g = (dest_gf > 255) ? 255 : (u8)dest_gf;
			dest->b = (dest_bf > 255) ? 255 : (u8)dest_bf;
			dest->a = (dest_af > 255) ? 255 : (u8)dest_af;
		}
	}
	
	// Overwrites the pixel at (x, y) with `pixel`, alpha included. No blending, no bounds check.
	static inline void image_replace_pixel_dangerously (Image* this, int x, int y, Rgba8 pixel) {
		u32* target = (u32*)this->data + (y*this->size.w + x);
		*target = pixel.data;
	}
	// Overwrites the pixel at (x, y) with `pixel`, alpha included.
	// Coordinates outside the image are silently ignored.
	static inline void image_replace_pixel (Image* this, int x, int y, Rgba8 pixel) {
		if ( x<0 || y<0 ) return;
		if ( x>=this->size.w || y>=this->size.h ) return;
		
		u32* target = (u32*)this->data + (y*this->size.w + x);
		*target = pixel.data;
	}
	// Alpha-blends `pixel` over the pixel at (x, y). No bounds check.
	static inline void image_draw_pixel_dangerously (Image* this, int x, int y, Rgba8 pixel) {
		Rgba8* target = (Rgba8*)this->data + (y*this->size.w + x);
		rgba8_overlay_alpha_blend(target, &pixel);
	}
	// Alpha-blends `pixel` over the pixel at (x, y).
	// Coordinates outside the image are silently ignored.
	static inline void image_draw_pixel (Image* this, int x, int y, Rgba8 pixel) {
		if ( x<0 || y<0 ) return;
		if ( x>=this->size.w || y>=this->size.h ) return;
		
		Rgba8* target = (Rgba8*)this->data + (y*this->size.w + x);
		rgba8_overlay_alpha_blend(target, &pixel);
	}

// fill

	// Sets every pixel of the image to `color` (plain overwrite, no blending).
	// With ENABLED_SIMD, images of 160 pixels or more are filled with 256-bit stores;
	// smaller images and non-SIMD builds use a simple u32 loop.
	static void image_clear_with_color (Image* this, Rgba8 color) {
		// note: the SIMD version is only about 5-15% faster with -O3, but it's 4x faster without optimization
		#if ENABLED_SIMD
			if (this->size.w*this->size.h < 160) {
				// small image: plain scalar fill
				u32* dest = this->data;
				int pixelcount = this->size.h*this->size.w;
				while (pixelcount) {
					*dest = color.data;
					dest ++;
					pixelcount --;
				}
			}
			else  {
				// count how many leading pixels come before the first 32-byte aligned address
				int aligned_start = 0;
				u32* mem = this->data;
				while ((uintptr_t)mem % 32) { mem ++; aligned_start ++; } // find byte that's aligned to 32 bytes
				
				// length of the vector-filled part in pixels, rounded down to a multiple of 32 pixels
				int aligned_length = this->size.w*this->size.h - aligned_start;
				aligned_length /= 32;
				aligned_length *= 32;
				
				int aligned_end = aligned_start + aligned_length;
				
				// one vector holds 8 copies of the pixel
				u32 pixel = color.data;
				__m256i simdpixels = _mm256_set1_epi32(pixel);
				
				// fill
				mem = this->data;
				int x = 0;
				// scalar head up to the aligned address
				for (; x<aligned_start; x++) {
					mem[x] = pixel;
				}
				// aligned body, 8 pixels per store
				__m256i* sp = (__m256i*)(mem+x);
				for (; x<aligned_end; x+=8, sp++) {
					*sp = simdpixels;
				}
				// scalar tail for whatever is left
				for (; x<this->size.w*this->size.h; x++) {
					mem[x] = pixel;
				}
			}
		#else
			u32* dest = this->data;
			int pixelcount = this->size.h*this->size.w;
			while (pixelcount) {
				*dest = color.data;
				dest ++;
				pixelcount --;
			}
		#endif
	}

// lines

	// Overwrites `width` consecutive pixels, starting at `pos` and going right, with `color`.
	// No clipping of any kind: the caller has to pass a span that lies inside the image
	// and a width that is not negative.
	static inline void image_replace_pixel_row_dangerously (Image* this, Vec2i pos, int width, Rgba8 color) {
		u32* cursor = (u32*)this->data + (pos.y*this->size.w + pos.x);
		
		for (; width; width--) {
			*cursor++ = color.data;
		}
	}
	// Overwrites a horizontal run of `width` pixels starting at `pos` with `color`.
	// The run is clipped against the image; rows outside the image and runs that
	// end up empty are ignored.
	static inline void image_replace_pixel_row (Image* this, Vec2i pos, int width, Rgba8 color) {
		// fix bounds
		// valid rows are 0 .. h-1, so y == h is already outside
		if (pos.y < 0 || pos.y >= this->size.h) {
			return;
		}
		if (pos.x < 0) {
			// drop the part that sticks out on the left
			width += pos.x;
			pos.x = 0;
		}
		if (pos.x+width > this->size.w) {
			width = this->size.w-pos.x;
		}
		if (width <= 0) {
			return;
		}
		
		image_replace_pixel_row_dangerously(this, pos, width, color);
	}
	// Alpha-blends `color` over `width` consecutive pixels, starting at `pos` and going right.
	// No clipping of any kind: the caller has to pass a span that lies inside the image
	// and a width that is not negative.
	static inline void image_draw_pixel_row_dangerously (Image* this, Vec2i pos, int width, Rgba8 color) {
		Rgba8* cursor = (Rgba8*)this->data + (pos.y*this->size.w + pos.x);
		
		for (; width; width--) {
			rgba8_overlay_alpha_blend(cursor, &color);
			cursor++;
		}
	}
	// Alpha-blends `color` over a horizontal run of `width` pixels starting at `pos`.
	// The run is clipped against the image; rows outside the image and runs that
	// end up empty are ignored.
	static inline void image_draw_pixel_row (Image* this, Vec2i pos, int width, Rgba8 color) {
		// fix bounds
		// valid rows are 0 .. h-1, so y == h is already outside
		if (pos.y < 0 || pos.y >= this->size.h) {
			return;
		}
		if (pos.x < 0) {
			// drop the part that sticks out on the left
			width += pos.x;
			pos.x = 0;
		}
		if (pos.x+width > this->size.w) {
			width = this->size.w-pos.x;
		}
		if (width <= 0) {
			return;
		}
		
		image_draw_pixel_row_dangerously(this, pos, width, color);
	}
	
	// Draws a 1 pixel wide line from `from` to `to` by overwriting pixels with `color`.
	// Everything outside the image is clipped away.
	// note: the two axis-aligned fast paths stop one pixel before the larger end
	// coordinate, while the Bresenham path includes both end points.
	static void image_replace_line (Image* this, Vec2i from, Vec2i to, Rgba8 color) {
		// straight vertical line
		if (from.x == to.x) {
			if (from.x < 0 || from.x >= this->size.w) return;
			
			int yfrom = max(0, min(from.y, to.y));
			int yto = min(this->size.h, max(from.y, to.y));
			for (int y=yfrom; y<yto; y++) {
				image_replace_pixel_dangerously(this, from.x, y, color);
			}
		}
		// straight horizontal line
		else if (from.y == to.y) {
			if (from.y < 0 || from.y >= this->size.h) return;
			
			int xfrom = max(0, min(from.x, to.x));
			int xto = min(this->size.w, max(from.x, to.x));
			// both ends off the same side leave an empty (negative) span, which the
			// unchecked row writer must never see
			if (xto <= xfrom) return;
			image_replace_pixel_row_dangerously(this, vec2i(xfrom, from.y), xto-xfrom, color);
		}
		// bresenham line drawing algorithm
		else {
			int dx = abs(to.x - from.x);
			int dy = abs(to.y - from.y);
			int xdir = (from.x < to.x) ? 1 : -1;
			int ydir = (from.y < to.y) ? 1 : -1;
			int diff = dx - dy;
			int diff2;
			
			while (1) {
				// note: you can't know if the line is coming from outside or going outside (or both), so you kinda have to check every pixel.
				image_replace_pixel(this, from.x, from.y, color);
				
				if ((from.x==to.x) && (from.y==to.y)) break;
				diff2 = diff*2;
				if (diff2 > -dy){
					diff -= dy;
					from.x += xdir;
				}
				if (diff2 < dx){
					diff += dx;
					from.y += ydir;
				}
			}
		}
	}
	// Draws a 1 pixel wide line from `from` to `to` by alpha-blending `color` over the
	// existing pixels. Everything outside the image is clipped away.
	// note: the two axis-aligned fast paths stop one pixel before the larger end
	// coordinate, while the Bresenham path includes both end points.
	static void _image_draw_line (Image* this, Vec2i from, Vec2i to, Rgba8 color) {
		// straight vertical line
		if (from.x == to.x) {
			if (from.x < 0 || from.x >= this->size.w) return;
			
			int yfrom = max(0, min(from.y, to.y));
			int yto = min(this->size.h, max(from.y, to.y));
			for (int y=yfrom; y<yto; y++) {
				rgba8_overlay_alpha_blend(this->data + (y*this->size.w + from.x)*sizeof(u32), &color);
			}
		}
		// straight horizontal line
		else if (from.y == to.y) {
			if (from.y < 0 || from.y >= this->size.h) return;
			
			int xfrom = max(0, min(from.x, to.x));
			int xto = min(this->size.w, max(from.x, to.x));
			// both ends off the same side leave an empty (negative) span, which the
			// unchecked row writer must never see
			if (xto <= xfrom) return;
			image_draw_pixel_row_dangerously(this, vec2i(xfrom, from.y), xto-xfrom, color);
		}
		// bresenham line drawing algorithm
		else {
			int dx = abs(to.x - from.x);
			int dy = abs(to.y - from.y);
			int xdir = (from.x < to.x) ? 1 : -1;
			int ydir = (from.y < to.y) ? 1 : -1;
			int diff = dx - dy;
			int diff2;
			
			while (1) {
				// note: you can't know if the line is coming from outside or going outside (or both), so you kinda have to check every pixel.
				if (from.x>=0 && from.y>=0 && from.x<this->size.w && from.y<this->size.h) {
					rgba8_overlay_alpha_blend(this->data + (from.y*this->size.w + from.x)*sizeof(u32), &color);
				}
				
				if ((from.x==to.x) && (from.y==to.y)) break;
				diff2 = diff*2;
				if (diff2 > -dy){
					diff -= dy;
					from.x += xdir;
				}
				if (diff2 < dx){
					diff += dx;
					from.y += ydir;
				}
			}
		}
	}
	// Draws a line and picks the cheapest way to do it based on the alpha of `color`:
	// invisible colors draw nothing, opaque colors overwrite, everything else blends.
	static inline void image_draw_line (Image* this, Vec2i from, Vec2i to, Rgba8 color) {
		switch (color.a) {
			case 0:
				// fully transparent, nothing to draw
				break;
			case 255:
				image_replace_line(this, from, to, color);
				break;
			default:
				_image_draw_line(this, from, to, color);
				break;
		}
	}

// rectangles

	// Overwrites the rectangle `bounds` with `color`, one pixel row at a time.
	// Nothing is clipped: the rectangle has to lie inside the image.
	static inline void image_replace_rect_dangerously (Image* this, Bounds2i bounds, Rgba8 color) {
		int row = bounds.y;
		int rows_left = bounds.h;
		
		while (rows_left > 0) {
			image_replace_pixel_row_dangerously(this, vec2i(bounds.x, row), bounds.w, color);
			row++;
			rows_left--;
		}
	}
	// Alpha-blends `color` over the rectangle `bounds`, one pixel row at a time.
	// Nothing is clipped: the rectangle has to lie inside the image.
	static inline void image_draw_rect_dangerously (Image* this, Bounds2i bounds, Rgba8 color) {
		int row = bounds.y;
		int rows_left = bounds.h;
		
		while (rows_left > 0) {
			image_draw_pixel_row_dangerously(this, vec2i(bounds.x, row), bounds.w, color);
			row++;
			rows_left--;
		}
	}
	// Overwrites the rectangle `bounds` with `color`. A negative width or height means
	// the rectangle extends to the left / upwards from its origin. The rectangle is
	// clipped against the image and nothing happens if no part of it remains.
	static inline void image_replace_rect (Image* this, Bounds2i bounds, Rgba8 color) {
		// think in edges rather than origin + extent
		int left = bounds.x;
		int right = bounds.x + bounds.w;
		int top = bounds.y;
		int bottom = bounds.y + bounds.h;
		
		// negative extents simply mean the edges are given in reverse order
		if (right < left) {
			int keep = left;
			left = right;
			right = keep;
		}
		if (bottom < top) {
			int keep = top;
			top = bottom;
			bottom = keep;
		}
		
		// clip against the image
		if (left < 0) left = 0;
		if (right > this->size.w) right = this->size.w;
		if (top < 0) top = 0;
		if (bottom > this->size.h) bottom = this->size.h;
		
		// nothing visible left
		if (right - left <= 0 || bottom - top <= 0) return;
		
		bounds.x = left;
		bounds.y = top;
		bounds.w = right - left;
		bounds.h = bottom - top;
		
		#if ENABLED_SIMD
			// the vector path is only used for wide rectangles
			if (bounds.w >= 160) {
				image_replace_rect_dangerously_simd(this, bounds, color);
			}
			else {
				image_replace_rect_dangerously(this, bounds, color);
			}
		#else
			image_replace_rect_dangerously(this, bounds, color);
		#endif
	}
	// Alpha-blends `color` over the rectangle `bounds`. Opaque colors are handed to
	// image_replace_rect, invisible colors draw nothing. A negative width or height means
	// the rectangle extends to the left / upwards from its origin. The rectangle is
	// clipped against the image and nothing happens if no part of it remains.
	static inline void image_draw_rect (Image* this, Bounds2i bounds, Rgba8 color) {
		if (color.a == 0) return;
		if (color.a == 255) {
			image_replace_rect(this, bounds, color);
			return;
		}
		
		// think in edges rather than origin + extent
		int left = bounds.x;
		int right = bounds.x + bounds.w;
		int top = bounds.y;
		int bottom = bounds.y + bounds.h;
		
		// negative extents simply mean the edges are given in reverse order
		if (right < left) {
			int keep = left;
			left = right;
			right = keep;
		}
		if (bottom < top) {
			int keep = top;
			top = bottom;
			bottom = keep;
		}
		
		// clip against the image
		if (left < 0) left = 0;
		if (right > this->size.w) right = this->size.w;
		if (top < 0) top = 0;
		if (bottom > this->size.h) bottom = this->size.h;
		
		// nothing visible left
		if (right - left <= 0 || bottom - top <= 0) return;
		
		bounds.x = left;
		bounds.y = top;
		bounds.w = right - left;
		bounds.h = bottom - top;
		
		image_draw_rect_dangerously(this, bounds, color);
	}
	
	// negative thickness is inside the bounds, positive is outside.
	// Draws the outline of `bounds` by overwriting pixels with `color`.
	// A negative thickness puts the outline inside the bounds, a positive one outside;
	// a thickness of 0 draws nothing. Each side is clipped against the image by
	// image_replace_rect. The four sides never overlap each other.
	static inline void image_replace_rect_stroke (Image* this, Bounds2i bounds, int thickness, Rgba8 color) {
		if (thickness == 0) return;
		
		// bring rectangles with negative extents into the usual form so the sides below line up
		if (bounds.w < 0) {
			bounds.w = -bounds.w;
			bounds.x -= bounds.w;
		}
		if (bounds.h < 0) {
			bounds.h = -bounds.h;
			bounds.y -= bounds.h;
		}
		
		if (thickness < 0) {
			thickness = -thickness;
			
			// an inside outline that is at least half as thick as the rectangle covers all of it
			if (bounds.w <= thickness*2 || bounds.h <= thickness*2) {
				image_replace_rect(this, bounds, color);
				return;
			}
			
			image_replace_rect(this, bounds2i(bounds.x, bounds.y, bounds.w, thickness), color); // top
			image_replace_rect(this, bounds2i(bounds.x, bounds.y+bounds.h-thickness, bounds.w, thickness), color); // bottom
			image_replace_rect(this, bounds2i(bounds.x, bounds.y+thickness, thickness, bounds.h-thickness*2), color); // left
			image_replace_rect(this, bounds2i(bounds.x+bounds.w-thickness, bounds.y+thickness, thickness, bounds.h-thickness*2), color); // right
		}
		else {
			// an outside outline never touches the interior, whatever the size of the rectangle
			image_replace_rect(this, bounds2i(bounds.x-thickness, bounds.y-thickness, bounds.w+thickness*2, thickness), color); // top
			image_replace_rect(this, bounds2i(bounds.x-thickness, bounds.y+bounds.h, bounds.w+thickness*2, thickness), color); // bottom
			image_replace_rect(this, bounds2i(bounds.x-thickness, bounds.y, thickness, bounds.h), color); // left
			image_replace_rect(this, bounds2i(bounds.x+bounds.w, bounds.y, thickness, bounds.h), color); // right
		}
	}
	// Draws the outline of `bounds` by alpha-blending `color` over the existing pixels.
	// A negative thickness puts the outline inside the bounds, a positive one outside;
	// a thickness of 0 draws nothing. Each side is clipped against the image by
	// image_draw_rect. The four sides never overlap each other, so no pixel is
	// blended more than once.
	static inline void image_draw_rect_stroke (Image* this, Bounds2i bounds, int thickness, Rgba8 color) {
		if (thickness == 0) return;
		
		// bring rectangles with negative extents into the usual form so the sides below line up
		if (bounds.w < 0) {
			bounds.w = -bounds.w;
			bounds.x -= bounds.w;
		}
		if (bounds.h < 0) {
			bounds.h = -bounds.h;
			bounds.y -= bounds.h;
		}
		
		if (thickness < 0) {
			thickness = -thickness;
			
			// an inside outline that is at least half as thick as the rectangle covers all of it.
			// Returning here matters: drawing the sides as well would blend those pixels twice.
			if (bounds.w <= thickness*2 || bounds.h <= thickness*2) {
				image_draw_rect(this, bounds, color);
				return;
			}
			
			image_draw_rect(this, bounds2i(bounds.x, bounds.y, bounds.w, thickness), color); // top
			image_draw_rect(this, bounds2i(bounds.x, bounds.y+bounds.h-thickness, bounds.w, thickness), color); // bottom
			image_draw_rect(this, bounds2i(bounds.x, bounds.y+thickness, thickness, bounds.h-thickness*2), color); // left
			image_draw_rect(this, bounds2i(bounds.x+bounds.w-thickness, bounds.y+thickness, thickness, bounds.h-thickness*2), color); // right
		}
		else {
			// an outside outline never touches the interior, whatever the size of the rectangle
			image_draw_rect(this, bounds2i(bounds.x-thickness, bounds.y-thickness, bounds.w+thickness*2, thickness), color); // top
			image_draw_rect(this, bounds2i(bounds.x-thickness, bounds.y+bounds.h, bounds.w+thickness*2, thickness), color); // bottom
			image_draw_rect(this, bounds2i(bounds.x-thickness, bounds.y, thickness, bounds.h), color); // left
			image_draw_rect(this, bounds2i(bounds.x+bounds.w, bounds.y, thickness, bounds.h), color); // right
		}
	}

// images

	// Copies the `crop` region of `img` into `this` with its top-left corner at `pos`.
	// Pixels are overwritten as they are (alpha included, no blending).
	// Nothing is clipped: the crop has to lie inside `img` and the target area inside `this`.
	static void image_replace_image_cropped_dangerously (Image* this, Image* img, Bounds2i crop, Vec2i pos) {
		int x = pos.x;
		int y = pos.y;
		int w = crop.w;
		int h = crop.h;
		int cx = crop.x;
		int cy = crop.y;
		
		// NOTE: the second loop is like 10 times faster, however when compiling with -O3 on release mode, the first one becomes about 20-50% faster.
		#if BUILD_RELEASE
		// release builds: copy each row pixel by pixel
		for (int i=0; i<h; i++, cy++, y++) {
			u32* dest = this->data + (y*this->size.w + x)*sizeof(u32);
			u32* src = img->data + (cy*img->size.w + cx)*sizeof(u32);
			for (int ii=0; ii<w; ii++) {
				dest[ii] = src[ii];
			}
		}
		#else
		// other builds: copy each row with a single mem_copy_to call
		for (int i=0; i<h; i++, cy++, y++) {
			u32* dest = this->data + (y*this->size.w + x)*sizeof(Rgba8);
			u32* src = img->data + (cy*img->size.w + cx)*sizeof(Rgba8);
			mem_copy_to(dest, w*sizeof(Rgba8), src);
		}
		#endif
	}
	// Copies the `crop` region of `img` into `this` at `pos`, overwriting pixels as they are.
	// The crop is first limited to the source image and then to the part that lands
	// inside the destination image; if nothing remains, nothing is copied.
	static void image_replace_image_cropped (Image* this, Image* img, Bounds2i crop, Vec2i pos) {
		// source side: pull the left/top edge of the crop into the source image...
		if (crop.x < 0) {
			crop.w += crop.x;
			crop.x = 0;
		}
		if (crop.y < 0) {
			crop.h += crop.y;
			crop.y = 0;
		}
		// ...and cut off whatever hangs over the right/bottom edge
		if (crop.w > img->size.w - crop.x) crop.w = img->size.w - crop.x;
		if (crop.h > img->size.h - crop.y) crop.h = img->size.h - crop.y;
		
		// destination side: skip the columns/rows that would land left of / above the target...
		if (pos.x < 0) {
			crop.w += pos.x;
			crop.x -= pos.x;
			pos.x = 0;
		}
		if (pos.y < 0) {
			crop.h += pos.y;
			crop.y -= pos.y;
			pos.y = 0;
		}
		// ...and cut off whatever hangs over the right/bottom edge of the target
		if (crop.w > this->size.w - pos.x) crop.w = this->size.w - pos.x;
		if (crop.h > this->size.h - pos.y) crop.h = this->size.h - pos.y;
		
		// only copy if something is left after clipping
		if (crop.w > 0 && crop.h > 0) {
			image_replace_image_cropped_dangerously(this, img, crop, pos);
		}
	}
	// Copies all of `img` into `this` at `pos`, overwriting pixels as they are.
	// Clipping against the destination is handled by image_replace_image_cropped.
	static inline void image_replace_image (Image* this, Image* img, Vec2i pos) {
		image_replace_image_cropped(this, img, bounds2i(0, 0, img->size.w, img->size.h), pos);
	}
	
	// Draws the `crop` region of `img` onto `this` with its top-left corner at `pos`.
	// `opacity` below 1 scales every source alpha and blends; an opacity of 0 or less draws nothing.
	// At full opacity the transparency flags of `img` select the cheapest method:
	// full alpha blending, an alpha threshold copy, or a plain copy.
	// Nothing is clipped: the crop has to lie inside `img` and the target area inside `this`.
	static void image_draw_image_cropped_dangerously (Image* this, Image* img, Bounds2i crop, Vec2i pos, f32 opacity) {
		int x = pos.x;
		int y = pos.y;
		int w = crop.w;
		int h = crop.h;
		int cx = crop.x;
		int cy = crop.y;
		
		if (opacity < 1) {
			if (opacity <= 0) return;
			// global transparency blend
			Rgba8* src;
			Rgba8* dest;
			for (int ypx=0; ypx<h; ypx++) {
				dest = this->data + ((y+ypx)*this->size.w + x)*sizeof(Rgba8);
				src = img->data + ((cy+ypx)*img->size.w + cx)*sizeof(Rgba8);
				for (int xpx=0; xpx<w; xpx++) {
					// work on a copy so the source image keeps its own alpha
					Rgba8 px = *src;
					px.a *= opacity;
					
					rgba8_overlay_alpha_blend(dest, &px);
					
					dest ++;
					src ++;
				}
			}
		}
		else {
			// pixel-based transparency
			if (img->HAS_SEMITRANSPARENCY) {
				// semitransparency, do alpha multiplication
				for (int ypx=0; ypx<h; ypx++) {
					Rgba8* dest = this->data + ((y+ypx)*this->size.w + x)*sizeof(Rgba8);
					Rgba8* src = img->data + ((cy+ypx)*img->size.w + cx)*sizeof(Rgba8);
					for (int xpx=0; xpx<w; xpx++) {
						rgba8_overlay_alpha_blend(dest, src);
						dest ++;
						src ++;
					}
				}
			}
			else if (img->HAS_TRANSPARENCY) {
				// threshold transparency, copy pixels if alpha is >= 128
				for (int ypx=0; ypx<h; ypx++) {
					Rgba8* dest = this->data + ((y+ypx)*this->size.w + x)*sizeof(Rgba8);
					Rgba8* src = img->data + ((cy+ypx)*img->size.w + cx)*sizeof(Rgba8);
					for (int xpx=0; xpx<w; xpx++) {
						// if (src->a & 0x80) { // which is faster?
						if (src->a >= 128) {
							*dest = *src;
						}
						dest ++;
						src ++;
					}
				}
			}
			else {
				// no transparency, just copy the pixels from horizontal strips
				// NOTE: the second loop is like 10 times faster, however when compiling with -O3 on release mode, the first one becomes about 20-50% faster.
				#if BUILD_RELEASE
				// release builds: copy each row pixel by pixel
				for (int i=0; i<h; i++, cy++, y++) {
					u32* dest = this->data + (y*this->size.w + x)*sizeof(u32);
					u32* src = img->data + (cy*img->size.w + cx)*sizeof(u32);
					for (int ii=0; ii<w; ii++) {
						dest[ii] = src[ii];
					}
				}
				#else
				// other builds: copy each row with a single mem_copy_to call
				for (int i=0; i<h; i++, cy++, y++) {
					u32* dest = this->data + (y*this->size.w + x)*sizeof(u32);
					u32* src = img->data + (cy*img->size.w + cx)*sizeof(u32);
					mem_copy_to(dest, w*sizeof(u32), src);
				}
				#endif
			}
		}
	}
	// Draws the `crop` region of `img` onto `this` at `pos` with the given opacity.
	// The crop is first limited to the source image and then to the part that lands
	// inside the destination image; if nothing remains, nothing is drawn.
	static void image_draw_image_cropped (Image* this, Image* img, Bounds2i crop, Vec2i pos, f32 opacity) {
		// source side: pull the left/top edge of the crop into the source image...
		if (crop.x < 0) {
			crop.w += crop.x;
			crop.x = 0;
		}
		if (crop.y < 0) {
			crop.h += crop.y;
			crop.y = 0;
		}
		// ...and cut off whatever hangs over the right/bottom edge
		if (crop.w > img->size.w - crop.x) crop.w = img->size.w - crop.x;
		if (crop.h > img->size.h - crop.y) crop.h = img->size.h - crop.y;
		
		// destination side: skip the columns/rows that would land left of / above the target...
		if (pos.x < 0) {
			crop.w += pos.x;
			crop.x -= pos.x;
			pos.x = 0;
		}
		if (pos.y < 0) {
			crop.h += pos.y;
			crop.y -= pos.y;
			pos.y = 0;
		}
		// ...and cut off whatever hangs over the right/bottom edge of the target
		if (crop.w > this->size.w - pos.x) crop.w = this->size.w - pos.x;
		if (crop.h > this->size.h - pos.y) crop.h = this->size.h - pos.y;
		
		// only draw if something is left after clipping
		if (crop.w > 0 && crop.h > 0) {
			image_draw_image_cropped_dangerously(this, img, crop, pos, opacity);
		}
	}
	// Draws all of `img` onto `this` at `pos` with full opacity.
	// Clipping and the choice of blending method are handled by image_draw_image_cropped.
	static inline void image_draw_image (Image* this, Image* img, Vec2i pos) {
		image_draw_image_cropped(this, img, bounds2i(0, 0, img->size.w, img->size.h), pos, 1.0);
	}
	
	// Draws the `crop` region of `img` onto `this` at the fractional position `pos`,
	// enlarged by `zoom`, mixing neighbouring source pixels where a target pixel
	// straddles a source pixel boundary. The result is alpha-blended over the target.
	// The target area is only cut off at the right and bottom edge of `this`.
	// NOTE(review): no clipping for a negative target x/y is visible here, and a zoom of 0
	// would divide by zero - presumably callers guarantee both; confirm.
	static void image_draw_image_smoothly_cropped_dangerously (Image* this, Image* img, Bounds2i crop, Vec2f pos, Vec2f zoom) {
		// TODO: downscaling
		// b = integer target rectangle that fully covers the zoomed crop
		Bounds2i b;
		b.x = floor(pos.x);
		b.y = floor(pos.y);
		b.w = ceil((pos.x + (f32)crop.w * zoom.x) - b.x);
		b.h = ceil((pos.y + (f32)crop.h * zoom.y) - b.y);
		// distance from the fractional position to the first target pixel (0 or negative)
		f32 firstoffsetx = (f32)b.x - pos.x;
		f32 firstoffsety = (f32)b.y - pos.y;
		
		// out of target bounds fix
		if (b.x+b.w > this->size.w) b.w = this->size.w - b.x;
		if (b.y+b.h > this->size.h) b.h = this->size.h - b.y;
		
		// source lookup that yields a transparent pixel for anything outside the crop.
		// This is a function defined inside a function (a GNU C extension) so it can read `crop`.
		Rgba8 transparent = RGBA8_TRANSPARENT;
		Rgba8* getpx (Image* img, int x, int y) {
			if (y < crop.y || y >= crop.y+crop.h) return &transparent;
			if (x < crop.x || x >= crop.x+crop.w) return &transparent;
			return image_get_rgba8_pixel_dangerously(img, x, y);
		}
		for (int y=0; y<b.h; y++) {
			// srcpxy: source row under this target row
			// centery: how far the end of that source row is away, in target pixels;
			// a value below 1 means this target row also covers part of the next source row
			f32 realsrcy = (f32)y + firstoffsety;
			f32 srcpxy = floor((realsrcy) / zoom.y);
			f32 centery = (srcpxy+1) * zoom.y - realsrcy;
			srcpxy += crop.y;
			for (int x=0; x<b.w; x++) {
				// same as above for the horizontal direction
				f32 realsrcx = (f32)x + firstoffsetx;
				f32 srcpxx = floor((realsrcx) / zoom.x);
				f32 centerx = (srcpxx+1) * zoom.x - realsrcx;
				srcpxx += crop.x;
				
				Rgba8* dst = image_get_rgba8_pixel_dangerously(this, x+b.x, y+b.y);
				if (centerx < 1 && centery < 1) {
					// corner: mix four source pixels
					Rgba8* src1 = getpx(img, srcpxx, srcpxy);
					Rgba8* src2 = getpx(img, srcpxx+1, srcpxy);
					Rgba8* src3 = getpx(img, srcpxx, srcpxy+1);
					Rgba8* src4 = getpx(img, srcpxx+1, srcpxy+1);
					Rgba8 top = rgba8_mix_colors(*src1, *src2, centerx);
					Rgba8 bot = rgba8_mix_colors(*src3, *src4, centerx);
					Rgba8 result = rgba8_mix_colors(top, bot, centery);
					rgba8_overlay_alpha_blend(dst, &result);
				}
				else if (centerx < 1) {
					// vertical boundary: mix with the right neighbour
					Rgba8* src1 = getpx(img, srcpxx, srcpxy);
					Rgba8* src2 = getpx(img, srcpxx+1, srcpxy);
					Rgba8 result = rgba8_mix_colors(*src1, *src2, centerx);
					rgba8_overlay_alpha_blend(dst, &result);
				}
				else if (centery < 1) {
					// horizontal boundary: mix with the neighbour below
					Rgba8* src1 = getpx(img, srcpxx, srcpxy);
					Rgba8* src2 = getpx(img, srcpxx, srcpxy+1);
					Rgba8 result = rgba8_mix_colors(*src1, *src2, centery);
					rgba8_overlay_alpha_blend(dst, &result);
				}
				else {
					// target pixel lies completely inside one source pixel
					Rgba8* result = getpx(img, srcpxx, srcpxy);
					rgba8_overlay_alpha_blend(dst, result);
				}
			}
		}
	}
	// Limits `crop` to the area of the source image and then draws it, zoomed and
	// smoothed, with image_draw_image_smoothly_cropped_dangerously.
	// note: only the source side is checked here, the target position is passed on unchanged.
	static void image_draw_image_smoothly_cropped (Image* this, Image* img, Bounds2i crop, Vec2f pos, Vec2f zoom) {
		// pull the left/top edge of the crop into the source image...
		if (crop.x < 0) {
			crop.w += crop.x;
			crop.x = 0;
		}
		if (crop.y < 0) {
			crop.h += crop.y;
			crop.y = 0;
		}
		// ...and cut off whatever hangs over the right/bottom edge
		if (crop.w > img->size.w - crop.x) crop.w = img->size.w - crop.x;
		if (crop.h > img->size.h - crop.y) crop.h = img->size.h - crop.y;
		
		image_draw_image_smoothly_cropped_dangerously(this, img, crop, pos, zoom);
	}
	// Draws all of `img` onto `this` at the fractional position `pos`, zoomed and smoothed.
	// The crop is exactly the source image, so the unchecked variant is called directly.
	static inline void image_draw_image_smoothly (Image* this, Image* img, Vec2f pos, Vec2f zoom) {
		image_draw_image_smoothly_cropped_dangerously(this, img, bounds2i(0, 0, img->size.w, img->size.h), pos, zoom);
	}

// save

	// In-memory layout of a BMP file as written by image_save_as_bmp_file:
	// file header, image header, then the pixel data. Packing is forced to 1 so
	// the structs can be written to disk exactly as they are laid out here.
	#pragma pack(push, 1)
	typedef struct {
		char type[2];		// The characters "BM"
		u32 size;			// The size of the file in bytes
		u16 reserved1;		// Unused - must be zero
		u16 reserved2;		// Unused - must be zero
		u32 offBits;		// Offset to start of Pixel Data
	} Image_bmpfileheader;
	
	typedef struct {
		u32 headerSize;		// Header Size - Must be at least 40
		i32 width;			// Image width in pixels
		i32 height;			// Image height in pixels
		u16 planes;			// Must be 1
		u16 bitCount;		// Bits per pixel - 1, 4, 8, 16, 24, or 32
		u32 compression;	// Compression type (0 = uncompressed)
		u32 sizeImage;		// Image Size - may be zero for uncompressed images
		u32 xPelsPerMeter;	// Preferred resolution in pixels per meter
		u32 yPelsPerMeter;	// Preferred resolution in pixels per meter
		u32 clrUsed;		// Number Color Map entries that are actually used
		u32 clrImportant;	// Number of significant colors
	} Image_bmpimageheader_win1;
	
	// Complete file: both headers followed directly by the pixel bytes (flexible array member).
	typedef struct {
		Image_bmpfileheader fh;
		Image_bmpimageheader_win1 ih;
		u8 data[];
	} Image_bmp;
	#pragma pack(pop)

	// Writes the image to `path` as an uncompressed 24-bit BMP file.
	// The alpha channel is dropped. Rows are stored top to bottom (negative height in
	// the header) and every row is padded to a multiple of 4 bytes, as the format requires.
	// Does nothing if the image has a negative size or the file buffer cannot be allocated.
	static void image_save_as_bmp_file (Image* img, char* path) {
		int w = img->size.w;
		int h = img->size.h;
		if (w < 0 || h < 0) return;
		
		// 3 bytes per pixel, rounded up to the next multiple of 4 per row
		u32 stride = ((u32)w*3 + 3) & ~(u32)3;
		u32 pixelbytes = stride * (u32)h;
		u32 filesize = sizeof(Image_bmp) + pixelbytes;
		
		// zeroed allocation: unused header fields and row padding stay 0
		Image_bmp* bmp = mem_alloc_zeroed(filesize);
		if (!bmp) return;
		
		bmp->fh.type[0] = 'B';
		bmp->fh.type[1] = 'M';
		bmp->fh.size = filesize;
		bmp->fh.offBits = sizeof(Image_bmp);
		
		bmp->ih.headerSize = sizeof(Image_bmpimageheader_win1);
		bmp->ih.width = w;
		bmp->ih.height = -h; // negative height = rows are stored top to bottom
		bmp->ih.planes = 1;
		bmp->ih.bitCount = 24;
		bmp->ih.sizeImage = pixelbytes;
		
		for (int y=0; y<h; y++) {
			u8* dst = bmp->data + (u32)y*stride;
			for (int x=0; x<w; x++) {
				Rgba8* src = image_get_rgba8_pixel_dangerously(img, x, y);
				// BMP stores the channels in blue, green, red order
				dst[0] = src->b;
				dst[1] = src->g;
				dst[2] = src->r;
				dst += 3;
			}
		}
		os_write_to_file(path, bmp->fh.size, bmp);
		
		mem_free(bmp);
	}


// include guard
#endif