root/trunk/scripts/url_syntax.php

Revision 1539, 9.8 KB (checked in by jeena, 3 years ago)

Added more plugin interfaces

Line 
1<?php
2
3function check_url ($url2check, $types) {
4
5# Be paranoid about using grouping!
6
7$nz_digit       =  '[1-9]';
8$nz_digits      =  "(?:$nz_digit\\d*)";
9$digits         =  '(?:\d+)';
10$space          =  '(?:%20)';
11$nl             =  '(?:%0[Aa])';
12$dot            =  '\.';
13$plus           =  '\+';
14$qm             =  '\?';
15$ast            =  '\*';
16$hex            =  '[a-fA-F\d]';
17$alpha          =  '[a-zA-Z]';     # No, no locale.
18$alphas         =  "(?:${alpha}+)";
19$alphanum       =  '[a-zA-Z\d]';   # Letter or digit.
20$xalphanum      =  "(?:${alphanum}|%(?:3\\d|[46]$hex|[57][Aa\\d]))";
21                      # Letter or digit, or hex escaped letter/digit.
22$alphanums      =  "(?:${alphanum}+)";
23$escape         =  "(?:%$hex\{2})";
24$safe           =  '[$\-_.+]';
25$extra          =  "[\!*'(),]";
26$national       =  '[{}|\\^~[\]`]';
27$punctuation    =  '[<>#%"]';
28$reserved       =  '[;/?:@&=]';
29$uchar          =  "(?:${alphanum}|${safe}|${extra}|${escape})";
30$xchar          =  "(?:${alphanum}|${safe}|${extra}|${reserved}|${escape})";
31$uchar          =  str_replace (']|[', '', $uchar);  // Make string smaller, and speed up regex.
32$uchar          =  str_replace (']|[', '', $xchar);  // Make string smaller, and speed up regex.
33
34# URL schemeparts for ip based protocols:
35$user           =  "(?:(?:${uchar}|[;?&=])*)";
36$password       =  "(?:(?:${uchar}|[;?&=])*)";
37$hostnumber     =  "(?:${digits}(?:${dot}${digits}){3})";
38$toplabel       =  "(?:${alpha}(?:(?:${alphanum}|-)*${alphanum})?)";
39$domainlabel    =  "(?:${alphanum}(?:(?:${alphanum}|-)*${alphanum})?)";
40$hostname       =  "(?:(?:${domainlabel}${dot})*${toplabel})";
41$host           =  "(?:${hostname}|${hostnumber})";
42$hostport       =  "(?:${host}(?::${digits})?)";
43$login          =  "(?:(?:${user}(?::${password})?\@)?${hostport})";
44
45# The predefined schemes:
46
47# FTP (see also RFC959)
48$fsegment       =  "(?:(?:${uchar}|[?:\@&=])*)";
49$fpath          =  "(?:${fsegment}(?:/${fsegment})*)";
50$ftpurl         =  "(?:ftp://${login}(?:/${fpath}(?:;type=[AIDaid])?)?)";
51
52# FILE
53$fileurl        =  "(?:file://(?:${host}|localhost)?/${fpath})";
54
55# HTTP
56$hsegment       =  "(?:(?:${uchar}|[~;:\@&=])*)";
57$search         =  "(?:(?:${uchar}|[;:\@&=])*)";
58$hpath          =  "(?:${hsegment}(?:/${hsegment})*)";
59$httpurl        =  "(?:https?://${hostport}(?:/${hpath}(?:${qm}${search})?)?)";
60
61# GOPHER (see also RFC1436)
62$gopher_plus    =  "(?:${xchar}*)";
63$selector       =  "(?:${xchar}*)";
64$gtype          =      $xchar;     // Omitted parens!
65$gopherurl      =  "(?:gopher://${hostport}(?:/${gtype}(?:${selector}" .
66                      "(?:%09${search}(?:%09${gopher_plus})?)?)?)?)";
67
68# MAILTO (see also RFC822)
69$encoded822addr =  "(?:$xchar+)";
70$mailtourl      =  "(?:mailto:$encoded822addr)";
71$mailtonpurl    =  $encoded822addr;
72
73# NEWS (see also RFC1036)
74$article        =  "(?:(?:${uchar}|[;/?:&=])+\@${host})";
75$group          =  "(?:${alpha}(?:${alphanum}|[_.+-])*)";
76$grouppart      =  "(?:${article}|${group}|${ast})";
77$newsurl        =  "(?:news:${grouppart})";
78
79# NNTP (see also RFC977)
80$nntpurl        =  "(?:nntp://${hostport}/${group}(?:/${digits})?)";
81
82# TELNET
83$telneturl      =  "(?:telnet://${login}/?)";
84
85# WAIS (see also RFC1625)
86$wpath          =  "(?:${uchar}*)";
87$wtype          =  "(?:${uchar}*)";
88$database       =  "(?:${uchar}*)";
89$waisdoc        =  "(?:wais://${hostport}/${database}/${wtype}/${wpath})";
90$waisindex      =  "(?:wais://${hostport}/${database}${qm}${search})";
91$waisdatabase   =  "(?:wais://${hostport}/${database})";
92# $waisurl        =  "(?:${waisdatabase}|${waisindex}|${waisdoc})";
93# Speed up: the 3 types share a common prefix.
94$waisurl        =  "(?:wais://${hostport}/${database}" .
95                         "(?:(?:/${wtype}/${wpath})|${qm}${search})?)";
96
97# PROSPERO
98$fieldvalue     =  "(?:(?:${uchar}|[?:\@&])*)";
99$fieldname      =  "(?:(?:${uchar}|[?:\@&])*)";
100$fieldspec      =  "(?:;${fieldname}=${fieldvalue})";
101$psegment       =  "(?:(?:${uchar}|[?:\@&=])*)";
102$ppath          =  "(?:${psegment}(?:/${psegment})*)";
103$prosperourl    =  "(?:prospero://${hostport}/${ppath}(?:${fieldspec})*)";
104
105# LDAP (see also RFC1959)
106# First. import stuff from RFC 1779 (Distinguished Names).
107# We've modified things a bit.
108$dn_separator        =  "(?:[;,])";
109$dn_optional_space   =  "(?:${nl}?${space}*)";
110$dn_spaced_separator =  "(?:${dn_optional_space}${dn_separator}" .
111                              "${dn_optional_space})";
112$dn_oid              =  "(?:${digits}(?:${dot}${digits})*)";
113$dn_keychar          =  "(?:${xalphanum}|${space})";
114$dn_key              =  "(?:${dn_keychar}+|(?:OID|oid)${dot}${dn_oid})";
115$dn_string           =  "(?:${uchar}*)";
116$dn_attribute        =  "(?:(?:${dn_key}${dn_optional_space}=" .
117                                          "${dn_optional_space})?${dn_string})";
118$dn_name_component   =  "(?:${dn_attribute}(?:${dn_optional_space}" .
119                              "${plus}${dn_optional_space}${dn_attribute})*)";
120$dn_name             =  "(?:${dn_name_component}" .
121                            "(?:${dn_spaced_separator}${dn_name_component})*" .
122                               "${dn_spaced_separator}?)";
123
124# RFC 1558 defines the filter syntax, but that requires a PDA to recognize.
125# Since that's too powerful for Perl's REs, we allow any char between the
126# parenthesis (which have to be there.)
127$ldap_filter         =  "(?:\(${xchar}+\))";
128
129# This is from RFC 1777. It defines an attributetype as an 'OCTET STRING',
130# whatever that is.
131$ldap_attr_type      =  "(?:${uchar}+)";  # I'm just guessing here.
132                                             # The RFCs aren't clear.
133
134# Now we are at the grammar of RFC 1959.
135$ldap_attr_list =  "(?:${ldap_attr_type}(?:,${ldap_attr_type})*)";
136$ldap_attrs     =  "(?:${ldap_attr_list}?)";
137
138$ldap_scope     =  "(?:base|one|sub)";
139$ldapurl        =  "(?:ldap://(?:${hostport})?/${dn_name}" .
140                        "(?:${qm}${ldap_attrs}" .
141                        "(?:${qm}${ldap_scope}(?:${qm}${ldap_filter})?)?)?)";
142
143
144# RFC 2056 defines the format of URLs for the Z39.50 protocol.
145$z_database     =  "(?:${uchar}+)";
146$z_docid        =  "(?:${uchar}+)";
147$z_elementset   =  "(?:${uchar}+)";
148$z_recordsyntax =  "(?:${uchar}+)";
149$z_scheme       =  "(?:z39${dot}50[rs])";
150$z39_50url      =  "(?:${z_scheme}://${hostport}" .
151                           "(?:/(?:${z_database}(?:${plus}${z_database})*" .
152                                   "(?:${qm}${z_docid})?)?" .
153                               "(?:;esn=${z_elementset})?" .
154                               "(?:;rs=${z_recordsyntax}" .
155                                    "(?:${plus}${z_recordsyntax})*)?))";
156
157
158# RFC 2111 defines the format for cid/mid URLs.
159$url_addr_spec  =  "(?:(?:${uchar}|[;?:@&=])*)";
160$message_id     =  $url_addr_spec;
161$content_id     =  $url_addr_spec;
162$cidurl         =  "(?:cid:${content_id})";
163$midurl         =  "(?:mid:${message_id}(?:/${content_id})?)";
164
165
166# RFC 2122 defines the Vemmi URLs.
167$vemmi_attr     =  "(?:(?:${uchar}|[/?:@&])*)";
168$vemmi_value    =  "(?:(?:${uchar}|[/?:@&])*)";
169$vemmi_service  =  "(?:(?:${uchar}|[/?:@&=])*)";
170$vemmi_param    =  "(?:;${vemmi_attr}=${vemmi_value})";
171$vemmiurl       =  "(?:vemmi://${hostport}" . 
172                          "(?:/${vemmi_service}(?:${vemmi_param}*))?)";
173
174# RFC 2192 for IMAP URLs.
175# Import from RFC 2060.
176# $imap4_astring       =  "";
177# $imap4_search_key    =  "";
178# $imap4_section_text  =  "";
179$imap4_nz_number     =  $nz_digits;
180$achar          =  "(?:${uchar}|[&=~])";
181$bchar          =  "(?:${uchar}|[&=~:\@/])";
182$enc_auth_type  =  "(?:${achar}+)";
183$enc_list_mbox  =  "(?:${bchar}+)";
184$enc_mailbox    =  "(?:${bchar}+)";
185$enc_search     =  "(?:${bchar}+)";
186$enc_section    =  "(?:${bchar}+)";
187$enc_user       =  "(?:${achar}+)";
188$i_auth         =  "(?:;[Aa][Uu][Tt][Hh]=(?:${ast}|${enc_auth_type}))";
189$i_list_type    =  "(?:[Ll](?:[Ii][Ss][Tt]|[Ss][Uu][Bb]))";
190$i_mailboxlist  =  "(?:${enc_list_mbox}?;[Tt][Yy][Pp][Ee]=${i_list_type})";
191$i_uidvalidity  =  "(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=" .
192                          "${imap4_nz_number})";
193$i_messagelist  =  "(?:${enc_mailbox}(?:${qm}${enc_search})?" .
194                                       "(?:${i_uidvalidity})?)";
195$i_section      =  "(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn]=${enc_section})";
196$i_uid          =  "(?:/;[Uu][Ii][Dd]=${imap4_nz_number})";
197$i_messagepart  =  "(?:${enc_mailbox}(?:${i_uidvalidity})?${i_uid}" .
198                                       "(?:${i_section})?)";
199$i_command      =  "(?:${i_mailboxlist}|${i_messagelist}|${i_messagepart})";
200$i_userauth     =  "(?:(?:${enc_user}(?:${i_auth})?)|" .
201                         "(?:${i_auth}(?:${enc_user})?))";
202$i_server       =  "(?:(?:${i_userauth}\@)?${hostport})";
203$imapurl        =  "(?:imap://${i_server}/(?:$i_command)?)";
204
205# RFC 2224 for NFS.
206$nfs_mark       =  '[\$\-_.\!~*\'(),]';
207$nfs_unreserved =  "(?:${alphanum}|${nfs_mark})";
208$nfs_unreserved =  str_replace (']|[', '', $nfs_unreserved);  // Make string smaller, and speed up regex.
209$nfs_pchar      =  "(?:${nfs_unreserved}|${escape}|[:\@&=+])";
210$nfs_segment    =  "(?:${nfs_pchar}*)";
211$nfs_path_segs  =  "(?:${nfs_segment}(?:/${nfs_segment})*)";
212$nfs_url_path   =  "(?:/?${nfs_path_segs})";
213$nfs_rel_path   =  "(?:${nfs_path_segs}?)";
214$nfs_abs_path   =  "(?:/${nfs_rel_path})";
215$nfs_net_path   =  "(?://${hostport}(?:${nfs_abs_path})?)";
216$nfs_rel_url    =  "(?:${nfs_net_path}|${nfs_abs_path}|${nfs_rel_path})";
217$nfsurl         =  "(?:nfs:${nfs_rel_url})";
218
219$valid_types = array (
220  'http', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'wais', 'mailto',
221  'mailtonp', 'file', 'prospero', 'ldap', 'z39_50', 'cid', 'mid', 'vemmi',
222  'imap', 'nfs'
223);
224
225# Combining all the different URL formats into a single regex.
226
227$valid = false;
228
229if (!is_array ($types)) {
230    $types = array ($types);
231}
232
233foreach ($types as $type) {
234    if (!in_array ($type, $valid_types)) {
235        continue;
236    }
237    $re = $type.'url';
238    if (preg_match ('!^'.$$re.'$!i', $url2check)) {
239        $valid = $type;
240        break;
241    }
242}
243
244return $valid;
245
246}
247
248?>
Note: See TracBrowser for help on using the browser.