summaryrefslogtreecommitdiff
path: root/concurrent_dl.sh
blob: d2c9d363e58caf4ebc4cec2113baadcabbc12247 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/sh

# Number of URLs concurrent_dl launches per batch before waiting for them.
chunk=10
# dl_safe OUTFILE URL HEADERFILE
#
# Download URL to OUTFILE with curl, retrying until curl succeeds or the
# attempt limit is reached.
#   $1 - output path; its parent directory is created when it has one
#   $2 - URL to fetch
#   $3 - file holding request headers, handed to curl as -H @FILE
# Output (stdout): "OUTFILE <- URL" before the first attempt, and
# "Tried N times, bad URL." when every attempt failed.
# Each attempt is capped at 10 seconds; at most 10 attempts are made.
# The exit status does not reflect download success.
#
# Working variables carry a _dl_ prefix because POSIX sh has no
# function-local variables and callers commonly use names such as "i".
dl_safe()
{
	printf '%s <- %s\n' "$1" "$2"

	# Create the parent directory when the output path names one.  A path
	# such as "/file" has an empty parent, which needs no mkdir.
	case $1 in
	*/*)
		_dl_dir=${1%/*}
		if [ -n "$_dl_dir" ] && ! [ -d "$_dl_dir" ]
		then
			mkdir -p -- "$_dl_dir"
		fi
		;;
	esac

	_dl_status=-1
	_dl_tries=0
	_dl_limit=10
	while [ "$_dl_status" -ne 0 ] && [ "$_dl_tries" -lt "$_dl_limit" ]
	do
		curl --compressed --max-time 10 -sLo "$1" "$2" -H "@$3"
		_dl_status=$?
		_dl_tries=$((_dl_tries+1))
	done

	# Judge failure by curl's last exit status, not by the attempt count:
	# a download that succeeds on the final attempt is not a bad URL.
	if [ "$_dl_status" -ne 0 ]
	then
		echo "Tried $_dl_limit times, bad URL."
	fi
}
# Settings the caller may preset before this point; each falls back to its
# default when unset or empty.
#   outdir - directory prefix strip_path puts in front of file names
#   depth  - strip_path mode; 1 keeps the last two URL path components
: "${outdir:=page}"
: "${depth:=0}"
# strip_path
#
# Filter mapping URLs (stdin, one per line) to local file paths (stdout).
# A trailing "/" and everything from the first "?" are removed, then:
#   depth = 1  -> the last two path components are kept ("dir/file");
#                 note that $outdir is not prefixed in this mode
#   otherwise  -> everything up to the last "/" is replaced by "$outdir/"
# Afterwards $outdir is created if it does not exist yet.
# Globals read: outdir, depth.
strip_path()
{
	sed -e 's|/$||' -e 's|?.*||' |
	if [ "${depth:-0}" -eq 1 ]
	then
		sed -E -e 's|.*/([^/]*/[^/]*)$|\1|'
	else
		# Escape what is special on the right-hand side of s||| so that
		# $outdir is inserted literally even if it holds "&", "|" or "\".
		_sp_outdir=$(printf '%s\n' "$outdir" | sed -e 's/[&|\\]/\\&/g')
		sed -e "s|.*/|$_sp_outdir/|"
	fi

	# Quoted so a directory name with spaces stays a single argument; an
	# empty $outdir needs no directory.
	if [ -n "$outdir" ] && ! [ -d "$outdir" ]
	then
		mkdir -p -- "$outdir"
	fi
}
# _concurrent_dl_wait PID...
#
# Wait for each given background PID in turn, announcing it first.
_concurrent_dl_wait()
{
	for _cdl_pid in "$@"
	do
		echo "waiting on $_cdl_pid"
		wait "$_cdl_pid"
	done
}

# concurrent_dl [FILTER] HEADERFILE
#
# Download the URLs given on stdin (separated by spaces or newlines) in
# parallel batches of $chunk.
#   $1 - command that maps a URL on stdin to an output path on stdout;
#        defaults to strip_path when empty
#   $2 - header file passed through to dl_safe; required
# For every URL whose output path is not already a regular file, dl_safe is
# started in the background.  After each batch, and once more at the end,
# every job that was started is waited for, so no download is still running
# when the function returns.
# Prints "No headers, no good." and returns when $2 is empty.
# Relies on dl_safe (and strip_path for the default filter) being defined.
concurrent_dl()
{
	# Read all of stdin up front, one URL per line.
	in="$(tr ' ' '\n')"
	filter=$1
	if [ -z "$filter" ]
	then
		filter=strip_path
	fi
	headers=$2
	if [ -z "$headers" ]
	then
		echo No headers, no good.
		return
	fi

	printf '%s\n' "$in" | (
		# Fall back to 10 when chunk is unset or not a positive integer.
		batch=${chunk:-10}
		case $batch in
		''|*[!0-9]*|0) batch=10 ;;
		esac

		count=0
		pids=""
		while read -r url
		do
			# Blank lines (e.g. from repeated spaces) carry no URL.
			[ -n "$url" ] || continue

			# $filter is deliberately unquoted so that a filter given as
			# "command arg..." keeps working.
			o=$(printf '%s\n' "$url" | $filter)
			if [ -n "$o" ] && ! [ -f "$o" ]
			then
				dl_safe "$o" "$url" "$headers" &
				pids="$pids $!"
			fi

			count=$((count+1))
			if [ "$count" -ge "$batch" ]
			then
				# $pids is a space-separated list; splitting is intended.
				_concurrent_dl_wait $pids
				count=0
				pids=""
			fi
		done

		# Wait for every job of the final, partial batch.
		_concurrent_dl_wait $pids
	)
}