#!/usr/bin/env bash
#
# Finds regular files with identical content below a directory and replaces
# the duplicates with relative symlinks to one retained copy.
# Files are grouped by size first and then compared by their b3sum hash.

# Directory to deduplicate; main replaces it with the positional argument, if any.
ROOT_DIR="."
# Set to true by -n/--dry-run; deduplicate_files then only reports what it would do.
DRY_RUN=false

#######################################
# Write the usage text to stdout.
# Arguments: none
# Outputs:   the usage/option summary, one entry per line
#######################################
print_help() {
	printf '%s\n' \
		'Usage: dedup_symlink.sh [OPTIONS] [DIRECTORY]' \
		'' \
		'Options:' \
		'  -h, --help      Show this help message and exit.' \
		'  -T, --test      Create a test environment with some duplicates and run.' \
		'  -n, --dry-run   Show what would be done without making any changes.' \
		'' \
		'Arguments:' \
		'  DIRECTORY       The directory to deduplicate. Defaults to current directory.'
}

#######################################
# Report how many parallel hashing jobs to run.
# Tries `nproc`, then `getconf _NPROCESSORS_ONLN`; any missing tool, failure,
# or non-numeric answer falls through to the next source and finally to 1, so
# the caller always receives a positive integer.
# Arguments: none
# Outputs:   a positive integer on stdout
#######################################
detect_cpu_count() {
	local -r positive_int_re='^[1-9][0-9]*$'
	local cpu_count=""

	if command -v nproc &>/dev/null; then
		cpu_count="$(nproc 2>/dev/null)"
	fi

	if [[ ! "$cpu_count" =~ $positive_int_re ]] && command -v getconf &>/dev/null; then
		cpu_count="$(getconf _NPROCESSORS_ONLN 2>/dev/null)"
	fi

	# Never hand an empty or garbage value to the caller.
	if [[ ! "$cpu_count" =~ $positive_int_re ]]; then
		cpu_count=1
	fi

	echo "$cpu_count"
}

#######################################
# Verify that the external tools this script relies on are available.
# Checks for b3sum, GNU find (-printf), GNU xargs (-P) and a realpath that
# understands --relative-to.
# Arguments: none
# Outputs:   an error message on stderr for the first missing dependency
# Returns:   normally when everything is present; otherwise exits the
#            script with status 2
#######################################
check_dependencies() {
	if ! command -v b3sum &>/dev/null; then
		echo "ERROR: 'b3sum' not found. Please install Blake3." >&2
		exit 2
	fi

	if ! find --version 2>&1 | grep -qi 'GNU findutils'; then
		echo "ERROR: This script requires GNU find (for -printf)." >&2
		exit 2
	fi

	if ! xargs --version 2>&1 | grep -qi 'GNU'; then
		echo "ERROR: This script requires GNU xargs (for -P)." >&2
		exit 2
	fi

	# Symlink targets are computed with `realpath --relative-to`; probe the
	# option itself so that a realpath without it is rejected up front.
	if ! realpath --relative-to=. . &>/dev/null; then
		echo "ERROR: This script requires GNU realpath (for --relative-to)." >&2
		exit 2
	fi
}

#######################################
# Self-test: build a temporary directory tree containing known duplicate
# groups, run deduplicate_files on it and verify the outcome.
# Globals:   DRY_RUN is read indirectly by deduplicate_files
# Arguments: none
# Outputs:   listings of the tree before/after, FAIL lines for every failed
#            check, and a final "TEST RESULT" line on stdout
# Returns:   normally when every check passed; otherwise exits the script
#            with status 1. The temporary tree is removed in both cases.
#######################################
setup_test_environment() {
	local TEST_DIR
	# An empty TEST_DIR would make every path below resolve under "/".
	if ! TEST_DIR="$(mktemp -d)" || [[ -z "$TEST_DIR" ]]; then
		echo "ERROR: could not create a temporary test directory." >&2
		exit 1
	fi
	local PASS=true
	local link_target unique_target

	mkdir -p "$TEST_DIR/subdir1" "$TEST_DIR/subdir2" "$TEST_DIR/subdir3"
	mkdir -p "$TEST_DIR/deeper/inner"

	# Create duplicates of "a" in different dirs and with different mtimes/permissions
	echo "a" > "$TEST_DIR/file1.txt"
	sleep 1
	echo "a" > "$TEST_DIR/file9.txt"
	sleep 1
	echo "a" > "$TEST_DIR/subdir1/file2.txt"
	sleep 1
	echo "a" > "$TEST_DIR/subdir1/file5.txt"
	chmod 600 "$TEST_DIR/subdir1/file5.txt"
	touch -d "yesterday" "$TEST_DIR/subdir1/file2.txt"
	ln "$TEST_DIR/file1.txt" "$TEST_DIR/hardlink_to_a" # hard link

	# Create duplicates of "b" in different locations
	echo "b" > "$TEST_DIR/subdir2/file3.txt"
	echo "b" > "$TEST_DIR/subdir3/file4.txt"
	echo "b" > "$TEST_DIR/deeper/inner/fileb.txt"

	# Group "c": unique file and also a large file and empty file
	echo "c" > "$TEST_DIR/unique.txt"
	head -c 100000 /dev/urandom > "$TEST_DIR/large.bin"
	cp "$TEST_DIR/large.bin" "$TEST_DIR/subdir2/large_copy.bin"
	: > "$TEST_DIR/emptyfile"
	cp "$TEST_DIR/emptyfile" "$TEST_DIR/subdir1/emptyfile_copy"

	# Binary and multiline duplicate
	printf "bin\0data\n" > "$TEST_DIR/binfile"
	cp "$TEST_DIR/binfile" "$TEST_DIR/subdir1/binfile_copy"

	# A file that is a symlink (should not deduplicate)
	ln -s unique.txt "$TEST_DIR/link_to_unique"

	echo "Created test files:"
	find "$TEST_DIR" -exec ls -l {} +

	deduplicate_files "$TEST_DIR"

	echo
	echo "After deduplication:"
	find "$TEST_DIR" -type f -o -type l | while IFS= read -r file; do
		ls -l "$file"
	done

	# Helper: verify that every file of a group resolves to one and the same
	# real file. $1 is the group name used in messages, the remaining
	# arguments are the paths. Sets PASS (a local of the enclosing function,
	# visible here through dynamic scoping) to false on any failure.
	verify_group() {
		local group_name="$1"; shift
		local -a files=("$@")
		local -a targets=()
		local -a unique_targets=()
		local f
		for f in "${files[@]}"; do
			if [ ! -e "$f" ]; then
				echo "FAIL: $group_name $f missing"
				PASS=false
			else
				# readlink -f canonicalizes symlinks and plain files alike.
				targets+=("$(readlink -f "$f")")
			fi
		done
		# Read the de-duplicated targets line by line rather than relying on
		# unquoted word splitting, so paths containing spaces stay intact.
		if (( ${#targets[@]} > 0 )); then
			mapfile -t unique_targets < <(printf '%s\n' "${targets[@]}" | sort -u)
		fi
		if [ "${#unique_targets[@]}" -ne 1 ]; then
			echo "FAIL: Not all $group_name files are linked to a single file: ${unique_targets[*]}"
			PASS=false
		fi
	}

	# Group "a" verification (including hardlink)
	verify_group "a" "$TEST_DIR/file1.txt" "$TEST_DIR/file9.txt" "$TEST_DIR/subdir1/file2.txt" "$TEST_DIR/subdir1/file5.txt" "$TEST_DIR/hardlink_to_a"

	# Group "b"
	verify_group "b" "$TEST_DIR/subdir2/file3.txt" "$TEST_DIR/subdir3/file4.txt" "$TEST_DIR/deeper/inner/fileb.txt"

	# Group "large"
	verify_group "large" "$TEST_DIR/large.bin" "$TEST_DIR/subdir2/large_copy.bin"

	# Group "empty"
	verify_group "empty" "$TEST_DIR/emptyfile" "$TEST_DIR/subdir1/emptyfile_copy"

	# Group "bin"
	verify_group "bin" "$TEST_DIR/binfile" "$TEST_DIR/subdir1/binfile_copy"

	# Unique file untouched and not a symlink
	if [ ! -f "$TEST_DIR/unique.txt" ]; then
		echo "FAIL: unique.txt missing"
		PASS=false
	elif [ -L "$TEST_DIR/unique.txt" ]; then
		echo "FAIL: unique.txt is a symlink"
		PASS=false
	fi

	# Symlink to unique should still be a symlink and point to unique
	if [ ! -L "$TEST_DIR/link_to_unique" ]; then
		echo "FAIL: link_to_unique should be a symlink"
		PASS=false
	else
		link_target=$(readlink -f "$TEST_DIR/link_to_unique")
		unique_target=$(readlink -f "$TEST_DIR/unique.txt")
		if [ "$link_target" != "$unique_target" ]; then
			echo "FAIL: link_to_unique does not point to unique.txt"
			PASS=false
		fi
	fi

	# Clean up before reporting so a failing run does not leave the tree or
	# the nested helper function behind.
	unset -f verify_group
	rm -rf -- "${TEST_DIR:?}"

	if [[ "$PASS" == true ]]; then
		echo "TEST RESULT: ✅ PASS"
	else
		echo "TEST RESULT: ❌ FAIL"
		exit 1
	fi
}

#######################################
# Print the final statistics block.
# Globals (all optional, read only):
#   EMOJI_OK   when exactly "true", the header is printed with an icon;
#              otherwise (including unset) an ASCII "=====" header is used
#   ICON_DR    header icon for dry-run mode
#   ICON_OK    header icon for normal mode
#   ICON_FILE, ICON_BYTE, ICON_DUP, ICON_SAVE   per-row icons
#   Unset icons are printed as empty strings.
# Arguments:
#   $1 - number of files scanned
#   $2 - total bytes scanned
#   $3 - number of duplicate files found/replaced
#   $4 - bytes saved (or that would be saved)
#   $5 - "true" for dry-run wording and colour, anything else for normal
# Outputs:   the summary on stdout
#######################################
print_summary() {
	local file_count="$1"
	local total_bytes="$2"
	local duplicate_count="$3"
	local total_saved="$4"
	local dry_run="$5"

	local color_reset=$'\033[0m'
	local color_header title header_icon dup_label saved_label

	# Everything that differs between the two modes is decided here once.
	if [[ "$dry_run" == true ]]; then
		color_header=$'\033[1;36m'
		title="DRY-RUN SUMMARY"
		header_icon="${ICON_DR:-}"
		dup_label="Duplicate files found:"
		saved_label="Space that would be saved:"
	else
		color_header=$'\033[1;32m'
		title="DEDUPLICATION SUMMARY"
		header_icon="${ICON_OK:-}"
		dup_label="Duplicate files replaced:"
		saved_label="Space actually saved:"
	fi

	printf '\n'

	# Compare against the literal "true" instead of executing the variable's
	# contents as a command; an unset EMOJI_OK selects the ASCII header.
	if [[ "${EMOJI_OK:-false}" == true ]]; then
		printf '%s  %s  %s%s\n' "$color_header" "$header_icon" "$title" "$color_reset"
	else
		printf '%s===== %s =====%s\n' "$color_header" "$title" "$color_reset"
	fi

	local label_width=27
	local value_width=12

	printf '  %s  %-*s %*d\n' "${ICON_FILE:-}" "$label_width" "Total files scanned:" "$value_width" "$file_count"
	printf '  %s  %-*s %*d\n' "${ICON_BYTE:-}" "$label_width" "Total bytes scanned:" "$value_width" "$total_bytes"
	printf '  %s  %-*s %*d\n' "${ICON_DUP:-}" "$label_width" "$dup_label" "$value_width" "$duplicate_count"
	printf '  %s  %-*s %*d bytes\n' "${ICON_SAVE:-}" "$label_width" "$saved_label" "$value_width" "$total_saved"

	printf '\n'
}


#######################################
# Replace duplicate regular files below a directory with relative symlinks.
# Files are grouped by size; every group with more than one member is hashed
# with b3sum (in parallel via xargs), and within a group each file whose hash
# was already seen is replaced by a symlink to the first file with that hash.
# Globals:
#   DRY_RUN (read) - when "true", only report what would be done
# Arguments:
#   $1 - directory to scan
# Outputs:
#   progress and the summary (via print_summary) on stdout, warnings on stderr
#######################################
deduplicate_files() {
	local target_dir="$1"
	local cpu_count
	cpu_count="$(detect_cpu_count)"

	echo "Scanning files in: $target_dir"

	local file_list
	file_list="$(mktemp)"
	# One NUL-terminated "<size>\t<path>" record per regular file.
	find "$target_dir" -type f -printf '%s\t%p\0' > "$file_list"

	if [[ ! -s "$file_list" ]]; then
		echo "No regular files found. Nothing to deduplicate."
		rm -f "$file_list"
		return
	fi

	# size -> newline-separated list of paths having that size
	local -A size_map=()
	local total_bytes=0
	local file_count=0
	local line fsize fpath

	while IFS= read -r -d '' line; do
		fsize="${line%%$'\t'*}"
		fpath="${line#*$'\t'}"

		(( file_count++ ))
		(( total_bytes += fsize ))

		if [[ -z "${size_map["$fsize"]+set}" ]]; then
			size_map["$fsize"]="$fpath"
		else
			size_map["$fsize"]="${size_map["$fsize"]}"$'\n'"$fpath"
		fi
	done < "$file_list"

	rm -f "$file_list"

	local total_saved=0
	local duplicate_count=0
	local -a file_array
	local -A hash_map
	local hash_input hash_output hval original link_dir rel_path tmp_link
	# A regular b3sum line is 64 lowercase hex digits, two spaces, then the path.
	local -r hash_prefix_re='^[0-9a-f]{64}  $'

	for fsize in "${!size_map[@]}"; do
		file_array=()
		IFS=$'\n' read -r -d '' -a file_array < <(printf '%s\0' "${size_map[$fsize]}")

		# A size shared by no other file cannot have duplicates.
		(( ${#file_array[@]} > 1 )) || continue

		hash_input="$(mktemp)"
		printf '%s\0' "${file_array[@]}" > "$hash_input"

		hash_output="$(mktemp)"
		xargs -0 -n1 -P "$cpu_count" b3sum < "$hash_input" > "$hash_output"
		rm -f "$hash_input"

		# hash -> first path seen with that hash (the copy that is kept)
		hash_map=()
		while IFS= read -r line; do
			# Lines that do not start with "<hash><2 spaces>" (for example the
			# backslash-prefixed form used for escaped file names) cannot be
			# mapped back to a path reliably, so they are left untouched.
			if [[ ! "${line:0:66}" =~ $hash_prefix_re ]]; then
				echo "WARNING: skipping unparsable b3sum output line: $line" >&2
				continue
			fi
			hval="${line:0:64}"
			# Strip exactly the two-space separator so paths that begin with
			# whitespace are preserved.
			fpath="${line:66}"
			[[ -n "$fpath" ]] || continue

			if [[ -z "${hash_map[$hval]:-}" ]]; then
				hash_map["$hval"]="$fpath"
				continue
			fi

			original="${hash_map[$hval]}"
			link_dir="$(dirname -- "$fpath")"
			if ! rel_path="$(realpath --relative-to="$link_dir" -- "$original")" \
				|| [[ -z "$rel_path" ]]; then
				echo "WARNING: cannot compute link target for '$fpath'; left unchanged." >&2
				continue
			fi

			if [[ "$DRY_RUN" == true ]]; then
				echo "[DRY-RUN] Would remove '$fpath' (size: $fsize) and symlink -> '$rel_path'"
			else
				# Create the symlink under a temporary name and rename it over
				# the duplicate, so the duplicate is only removed once its
				# replacement exists.
				tmp_link="$link_dir/.dedup-symlink.$$.tmp"
				if ! ln -s -- "$rel_path" "$tmp_link"; then
					echo "WARNING: could not create symlink for '$fpath'; left unchanged." >&2
					continue
				fi
				if ! mv -fT -- "$tmp_link" "$fpath"; then
					rm -f -- "$tmp_link"
					echo "WARNING: could not replace '$fpath'; left unchanged." >&2
					continue
				fi
			fi

			# Only files that were (or would be) replaced are counted.
			(( duplicate_count++ ))
			(( total_saved += fsize ))
		done < "$hash_output"

		rm -f "$hash_output"
	done

	echo
	print_summary "$file_count" "$total_bytes" "$duplicate_count" "$total_saved" "$DRY_RUN"
}

#######################################
# Parse the command line and run the requested action.
# Globals:
#   DRY_RUN  (written) - set to true by -n/--dry-run
#   ROOT_DIR (written) - set from the single optional positional argument
# Arguments:
#   "$@" - the script's command-line arguments
# Outputs:
#   help text on stdout for -h; usage errors and help on stderr otherwise
# Returns:
#   exits 0 after -h or a successful -T run, exits 1 on usage errors
#   (unknown option, more than one directory, target not a directory);
#   otherwise returns after deduplicate_files has run
#######################################
main() {
	local positionals=()

	while [[ $# -gt 0 ]]; do
		case "$1" in
			-h|--help)
				print_help
				exit 0
				;;
			-T|--test)
				# The self-test runs the real deduplication, so it needs the
				# same external tools as a normal run.
				check_dependencies
				setup_test_environment
				exit 0
				;;
			-n|--dry-run)
				DRY_RUN=true
				shift
				;;
			--)
				# Everything after "--" is positional, even if it starts with "-".
				shift
				positionals+=("$@")
				break
				;;
			-*)
				# Reject unknown options instead of treating them as a directory.
				echo "ERROR: unknown option '$1'." >&2
				print_help >&2
				exit 1
				;;
			*)
				positionals+=("$1")
				shift
				;;
		esac
	done

	if [[ ${#positionals[@]} -gt 1 ]]; then
		echo "ERROR: at most one DIRECTORY may be given." >&2
		print_help >&2
		exit 1
	fi

	if [[ ${#positionals[@]} -eq 1 ]]; then
		ROOT_DIR="${positionals[0]}"
	fi

	# Keep a directory name with a leading dash from being read as an option
	# by the tools it is later passed to.
	if [[ "$ROOT_DIR" == -* ]]; then
		ROOT_DIR="./$ROOT_DIR"
	fi

	if [[ ! -d "$ROOT_DIR" ]]; then
		echo "ERROR: '$ROOT_DIR' is not a directory." >&2
		exit 1
	fi

	check_dependencies
	deduplicate_files "$ROOT_DIR"
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"
