diff options
author | epoch <epoch@hacking.allowed.org> | 2019-04-20 05:32:27 -0500 |
---|---|---|
committer | epoch <epoch@hacking.allowed.org> | 2019-04-20 05:32:27 -0500 |
commit | 6f402e2d2f052972886712f60d592684c8671982 (patch) | |
tree | 47a09324bd3c5e577ec5b7059bd6c8834bead115 /uri.h | |
parent | d42135919f480c8bba4ca1f043fbabf44dac708f (diff) | |
download | uritools-6f402e2d2f052972886712f60d592684c8671982.tar.gz uritools-6f402e2d2f052972886712f60d592684c8671982.zip |
rebased on an old copy of this repo. renamed everything. rewrote the uri parser. added uricmp. wew.
Diffstat (limited to 'uri.h')
-rw-r--r-- | uri.h | 253 |
1 files changed, 253 insertions, 0 deletions
#ifndef uri_H
#define uri_H

#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500 //for strdup
#endif
#include <string.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>

//uri_reserved = gen-delims / sub-delims
#define pe_gen_delims ":/?#[]@"
#define pe_sub_delims "!$&'()*+,;="
#define pe_ALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define pe_DIGIT "0123456789"
#define pe_HPUT "-._~"

//NOTE(review): not referenced anywhere in this header; kept because other
//translation units may use it. The name presumably means RFC 3986 - confirm.
unsigned char rfc3086_percent_encoding[256];

//Locale-independent ASCII helpers. They are only defined when the including
//file has not already provided macros with these names (e.g. via <ctype.h>).
//Arguments are evaluated more than once: do not pass expressions with side effects.
#ifndef isxdigit
#define isxdigit(a) (((a) >= 'a' && (a) <= 'f') || ((a) >= '0' && (a) <= '9') || ((a) >= 'A' && (a) <= 'F'))
#endif
#ifndef toupper
#define toupper(a) (((a) >= 'a' && (a) <= 'z') ? (a) - ' ' : (a))
#endif

//Despite its name this is the set of bytes that uriescape() copies through
//untouched: the reserved characters AND the unreserved ones.
char *uri_reserved={
  pe_gen_delims
  pe_sub_delims
  pe_ALPHA
  pe_DIGIT
  pe_HPUT
};

//returns non-zero when byte c is copied through verbatim by uriescape().
//strchr() would match the string terminator for c == 0, so NUL is rejected
//explicitly; an embedded NUL must become "%00".
static int uri_is_literal(char c) {
  return c != 0 && strchr(uri_reserved,c) != NULL;
}

//returns the value (0-15) of ASCII hex digit c, or -1 if c is not a hex digit.
static int uri_hexval(int c) {
  if(c >= '0' && c <= '9') return c-'0';
  if(c >= 'a' && c <= 'f') return c-'a'+10;
  if(c >= 'A' && c <= 'F') return c-'A'+10;
  return -1;
}

//returns non-zero when c may appear in a scheme: ALPHA / DIGIT / "+" / "-" / "."
static int uri_is_scheme_char(char c) {
  return (c >= 'a' && c <= 'z') ||
         (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') ||
         c == '+' || c == '-' || c == '.';
}

//returns the number of bytes uriescape() will write for the first len bytes of in.
//be sure to add one to this return value if you plan on putting a null byte at the end.
int uriescapelength(char *in,int len) {
  int rlen=0;
  int i;
  for(i=0;i<len;i++) {
    rlen+=uri_is_literal(in[i])?1:3;
  }
  return rlen;
}

//percent-encodes the first len bytes of in into out.
//bytes found in uri_reserved are copied as-is, everything else becomes %XX (upper-case hex).
//out is NOT null-terminated by this function.
// make sure your out char * has enough space! use uriescapelength for it.
void uriescape(char *in,char *out,int len) {
  static const char hex[]="0123456789ABCDEF";
  int i;
  int j=0;
  for(i=0;i<len;i++) {
    if(uri_is_literal(in[i])) {
      out[j++]=in[i];
    } else {
      //go through unsigned char so bytes >= 0x80 can not produce a negative index.
      unsigned char c=(unsigned char)in[i];
      out[j++]='%';
      out[j++]=hex[c >> 4];
      out[j++]=hex[c & 0x0F];
    }
  }
}

//decodes %XX sequences from the null-terminated string in into out.
//a '%' that is not followed by two hex digits is copied through literally.
//out is null-terminated and needs room for at most strlen(in)+1 bytes.
//memmove is used for the copies, so out may be the same buffer as in.
//returns the length of the decoded string, not counting the terminator.
int uriunescape(char *in,char *out) {
  char *o=out;
  char *s=in;
  char *t;
  size_t tail;
  while((t=strchr(s,'%'))) {
    if(t-s) {//if there are actually bytes to copy.
      memmove(o,s,t-s);
      o+=(t-s);
      s=t;
    }
    //t[2] is only inspected when t[1] is a hex digit, so we never read past the terminator.
    if(uri_hexval(t[1]) >= 0 && uri_hexval(t[2]) >= 0) {
      *o=(char)((uri_hexval(t[1]) << 4) + uri_hexval(t[2]));
      s+=3;//skip the %XX
    } else {
      *o='%';
      s++;//skip just the %. the next character might be a %
    }
    o++;
  }
  //copy the last part. (this is also the whole string when there was no % at all)
  tail=strlen(s);
  memmove(o,s,tail);
  o[tail]=0;
  return o+tail-out;
}

//the pieces of a URI. A[] and the named members alias each other:
//  A[0]=scheme A[1]=username A[2]=password A[3]=domain
//  A[4]=port   A[5]=path     A[6]=query_string A[7]=fragment_id
//every piece also has a short alias (s,u,k,d,P,p,q,f).
struct uri {//warning. it is technically undefined behavior to set one half of a union then use the other half.
  union {
    char *A[8];
    struct {
      union { char *s;char *scheme; };
      union { char *u;char *username; };
      union { char *k;char *password; };
      union { char *d;char *domain; };
      union { char *P;char *port; };
      union { char *p;char *path; };
      union { char *q;char *query_string; };
      union { char *f;char *fragment_id; };
    };
  };
};

//compares two URIs piece by piece. returns 0 when they match.
//otherwise, with i being the index of the piece in uri.A[]:
//  bit i     both have the piece but the strings differ
//  bit i+8   a has the piece, b does not
//  bit i+16  b has the piece, a does not
unsigned int uricmp(struct uri *a,struct uri *b) {
  unsigned int ret=0;
  int i;
  for(i=0;i<8;i++) {
    char *x=a->A[i];
    char *y=b->A[i];
    if(x && y) {
      if(strcmp(x,y)) ret|=(1u<<i);
    } else if(x) {
      ret|=(1u<<(i+8));//we have a's but not b's
    } else if(y) {
      ret|=(1u<<(i+16));//we have b's but not a's
    }
    //neither side has it: no problem here. both empty.
  }
  return ret;
}

/*
 schemes are case sensitive but canonicals are lower case.
 domain is case insensitive. return it lowercased?
 port is optional and in decimal
 path
 scheme://username:password@domain:port/path?query_string#fragment_id
 mailto:username@domain

 optional stuff:
 scheme, username, password, port, path, query_string, fragment_id
*/

//splits line IN PLACE into the pieces of u. line is modified (separators are
//overwritten with null bytes and the authority is shifted left by one byte),
//and the pointers stored in u point into line, so line must outlive u.
//
//u is not cleared first: fragment_id and query_string are always assigned
//(NULL when absent); every other member is only written when that piece is
//found, so the caller should zero u beforehand.
//
//a scheme is only stored when everything before the first ':' is a valid
//scheme (starts with a letter, then letters/digits/'+'/'-'/'.'). it is lowercased.
//
//NOTE(review): for "scheme:rest" where rest does not start with '/' or '.'
//(e.g. mailto:user@host) nothing but the scheme is stored. confirm this is intended.
//NOTE(review): an IPv6 domain keeps its leading '[' but loses the ']'.
//
//always returns 1.
int urifromline(struct uri *u,char *line) {
  char *t;
  char *colon;
  char *scan;

  //these first two are easy: cut them off the end of the line.
  if((u->fragment_id=strchr(line,'#'))) {
    *u->fragment_id=0;
    u->fragment_id++;
  }
  if((u->query_string=strchr(line,'?'))) {
    *u->query_string=0;
    u->query_string++;
  }

  //now we have scheme, user, pass, domain, port, and path. maybe.
  //let's see if this starts with a scheme.
  colon=strchr(line,':');
  if(colon && ((*line >= 'a' && *line <= 'z') || (*line >= 'A' && *line <= 'Z'))) {
    //walk backwards from the ':' with a local cursor so that u->scheme is
    //never left pointing into the middle of the line when the check fails.
    for(scan=colon-1;scan > line;scan--) {
      if(!uri_is_scheme_char(*scan)) break;
    }
    if(scan == line) {//we got through the for loop alright. line starts with a scheme.
      u->scheme=line;
      *colon=0;
      for(t=u->scheme;*t;t++) {
        if(*t >= 'A' && *t <= 'Z') *t+=' ';//lowercase it.
      }
      line=colon+1;
    }
  }

  if(*line == '/' && *(line+1) != '/') { //we have an absolute path. /like:this.maybe
    u->path=line;
    return 1;//we're done. nothing else to do.
  }
  if(*line == '.') { //we have a relative path like: ./derp or ../merp
    u->path=line;
    return 1;//we're done here. nothing else to do.
  }

  if(!(*line == '/' && line[1] == '/')) {
    //no authority section. we have all we need.
    return 1;
  }

  //we have an authority section.
  //shift it left by one byte, over the second '/', until the third '/'.
  //that frees up one byte to terminate it without clobbering the path's '/'.
  for(t=line+1;*(t+1) && *(t+1) != '/';t++) {
    *t=*(t+1);
  }
  *t=0;
  u->path=t+1;//if there was a /, path points at it and the stuff after.
              //if there wasn't a /, it points at a null byte. so "empty"
  u->username=line+1;//for now this contains all of the authority.

  if((u->domain=strchr(u->username,'@'))) {//we have user@host at least.
    *u->domain=0;
    u->domain++;
  } else {//this isn't really a username. it is the domain.
    u->domain=u->username;
    u->username=0;
  }
  //if we still have u->username we try to split to user and password
  if(u->username) {
    if((u->password=strchr(u->username,':'))) {
      *u->password=0;
      u->password++;
    }
  }
  if((u->port=strchr(u->domain,']')) && *u->domain == '[') {//this is an IPv6 host
    *u->port=0;
    u->port++;
    if(*u->port == ':') {
      *u->port=0;
      u->port++;//if it ends up being empty, whatever. that's a URI like: http://host:/path
    }
  } else { //we're safe to split port off at :
    if((u->port=strchr(u->domain,':'))) {
      *u->port=0;
      u->port++;
    } //there isn't a port. leave it unset.
  }
  return 1;
}

#endif