blob: d2c9d363e58caf4ebc4cec2113baadcabbc12247 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
#!/bin/sh
# Batch size for concurrent_dl: the number of URLs it steps through before
# pausing to wait on the downloads it has started in the background.
chunk=10
# dl_safe OUTFILE URL HEADERFILE
#
# Download URL to OUTFILE with curl, retrying up to 10 times.
#   $1  destination path; its parent directory is created when missing
#   $2  URL to fetch
#   $3  file holding extra request headers (handed to curl as -H @FILE)
# Prints "OUTFILE <- URL" before starting and "Tried 10 times, bad URL."
# on stdout when every attempt failed.  The exit status is 0 either way,
# so callers detect failure through the message or the missing file.
dl_safe()
{
  echo "$1 <- $2"

  # Only paths that contain a slash have a directory part to create.
  case "$1" in
  */*)
    dl_dir=${1%/*}
    # dl_dir is empty for a path such as "/file" (parent is the root).
    if [ -n "$dl_dir" ] && ! [ -d "$dl_dir" ]
    then
      mkdir -p "$dl_dir"
    fi
    ;;
  esac

  dl_status=1
  dl_tries=0
  dl_limit=10
  while [ "$dl_status" -ne 0 ] && [ "$dl_tries" -lt "$dl_limit" ]
  do
    curl --compressed --max-time 10 -sLo "$1" "$2" -H "@$3"
    dl_status=$?
    dl_tries=$((dl_tries+1))
  done

  # Decide on the last curl status, not on the attempt counter: a download
  # that succeeds on the final attempt is not a failure.
  if [ "$dl_status" -ne 0 ]
  then
    echo "Tried $dl_limit times, bad URL."
  fi
}
# Configuration defaults, applied only when the caller left the variable
# unset or empty:
#   outdir  directory that downloaded files are placed in
#   depth   path-mapping mode read by strip_path
: "${outdir:=page}"
: "${depth:=0}"
# strip_path
#
# Filter that maps URLs (one per line on stdin) to local file paths on
# stdout.  The query string and then a trailing slash are removed first.
#   depth = 1   keep the last two path components ("dir/file")
#   otherwise   keep the last component, prefixed with "$outdir/"
# Reads the globals depth (default 0) and outdir (default "page").
# Side effect: creates $outdir when it does not exist yet.
strip_path()
{
  sp_depth=${depth:-0}
  sp_outdir=${outdir:-page}

  # outdir ends up inside a sed replacement, so escape the characters that
  # are special there: backslash, '&' and the '|' delimiter.
  sp_repl=$(printf '%s\n' "$sp_outdir" | sed -e 's/[\\&|]/\\&/g')

  # Drop the query before the trailing slash; the other way round,
  # ".../name/?q=1" would be left ending in '/' with an empty file name.
  sed -e 's|[?].*||' -e 's|/$||' | (
    if [ "$sp_depth" -eq 1 ]
    then
      sed -E -e 's|.*/([^/]*/[^/]*)$|\1|'
    else
      sed -e "s|.*/|$sp_repl/|"
    fi
  )

  if ! [ -d "$sp_outdir" ]
  then
    mkdir -p -- "$sp_outdir"
  fi
}
# concurrent_dl [FILTER] HEADERFILE
#
# Download the URLs read from stdin (separated by spaces or newlines) in
# batches of $chunk parallel jobs (10 when chunk is unset).
#   $1  command that maps a URL on stdin to a local path on stdout;
#       defaults to strip_path when empty
#   $2  header file handed on to dl_safe; required
# A URL whose target file already exists is skipped.  Each batch is fully
# waited for before the next one starts, and the function returns only
# after every download it started has finished.
# Prints "waiting on PID" for each job, or "No headers, no good." when $2
# is missing.
concurrent_dl()
{
  # Consume stdin up front, before any argument checking.
  urls=$(tr ' ' '\n')

  filter=${1:-strip_path}
  headers=$2
  if [ -z "$headers" ]
  then
    echo No headers, no good.
    return
  fi

  batch=${chunk:-10}

  printf '%s\n' "$urls" | (
    pids=""
    seen=0
    while read -r url
    do
      # Runs of spaces and empty input produce blank lines; skip them.
      [ -n "$url" ] || continue

      # $filter is deliberately unquoted so that a filter given as a
      # command with arguments is split into words.
      # shellcheck disable=SC2086
      o=$(printf '%s\n' "$url" | $filter)
      if ! [ -f "$o" ]
      then
        dl_safe "$o" "$url" "$headers" &
        pids="$pids $!"
      fi

      # Every URL counts toward the batch, whether or not it was skipped.
      seen=$((seen+1))
      if [ "$seen" -ge "$batch" ]
      then
        for pid in $pids
        do
          echo waiting on $pid
          wait "$pid"
        done
        pids=""
        seen=0
      fi
    done

    # Wait for the final, possibly partial, batch.
    for pid in $pids
    do
      echo waiting on $pid
      wait "$pid"
    done
  )
}
|