summaryrefslogtreecommitdiff
path: root/uri.h
diff options
context:
space:
mode:
authorepoch <epoch@hacking.allowed.org>2019-04-20 05:32:27 -0500
committerepoch <epoch@hacking.allowed.org>2019-04-20 05:32:27 -0500
commit6f402e2d2f052972886712f60d592684c8671982 (patch)
tree47a09324bd3c5e577ec5b7059bd6c8834bead115 /uri.h
parentd42135919f480c8bba4ca1f043fbabf44dac708f (diff)
downloaduritools-6f402e2d2f052972886712f60d592684c8671982.tar.gz
uritools-6f402e2d2f052972886712f60d592684c8671982.zip
rebased on an old copy of this repo. renamed everything. rewrote the uri parser. added uricmp. wew.
Diffstat (limited to 'uri.h')
-rw-r--r--uri.h253
1 files changed, 253 insertions, 0 deletions
diff --git a/uri.h b/uri.h
new file mode 100644
index 0000000..97ce3c2
--- /dev/null
+++ b/uri.h
@@ -0,0 +1,253 @@
+#ifndef uri_H
+#define uri_H
+
+#define _XOPEN_SOURCE 500 //for strdup
+#include <string.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Character classes used for percent-encoding, written as string literals so
+// that they can be glued together by adjacent string-literal concatenation.
+//uri_reserved = gen-delims / sub-delims
+#define pe_gen_delims ":/?#[]@"
+#define pe_sub_delims "!$&'()*+,;="
+//char *pe_reserved[]=pe_gen_delims "" pe_sub_delims;
+#define pe_ALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define pe_DIGIT "0123456789"
+// The four punctuation characters that join ALPHA and DIGIT in the
+// (commented-out) pe_unreserved set below.
+#define pe_HPUT "-._~"
+//char *pe_unreserved[]=pe_ALPHA "" pe_DIGIT "" pe_HPUT;
+
+// NOTE(review): nothing in this header reads or writes this table, and the
+// name is presumably a typo for RFC 3986 -- confirm before relying on it.
+// It is an object definition in a header, so including uri.h from more than
+// one translation unit may produce duplicate-symbol errors at link time.
+unsigned char rfc3086_percent_encoding[256];
+
+// Local ASCII-only stand-ins for the <ctype.h> helpers of the same name
+// (this header does not include <ctype.h>). They compare against character
+// literals only, so they do not depend on the locale.
+// Every use of the argument is parenthesized so that expression arguments
+// (for example `c ? x : y` or `c | 0x20`) expand with the intended precedence.
+// The argument is still evaluated more than once: do not pass expressions
+// with side effects such as `*p++`.
+#define isxdigit(a) (((a) >= 'a' && (a) <= 'f') || ((a) >= '0' && (a) <= '9') || ((a) >= 'A' && (a) <= 'F'))
+#define toupper(a) (((a) >= 'a' && (a) <= 'z') ? (a) - ' ' : (a))
+
+// The set of bytes that uriescape() and uriescapelength() treat as "copy
+// through unchanged". Despite the name it holds both the reserved characters
+// (gen-delims, sub-delims) and the unreserved ones (ALPHA, DIGIT, "-._~"):
+// the five pe_* literals below are concatenated into one string.
+char *uri_reserved={
+ pe_gen_delims
+ pe_sub_delims
+ pe_ALPHA
+ pe_DIGIT
+ pe_HPUT
+};
+
+// Returns the number of bytes uriescape() writes for the first len bytes of
+// in: 1 for every byte listed in uri_reserved, 3 ("%XX") for every other
+// byte. The count does not include a terminating NUL; add one to the result
+// if you plan on putting a null byte at the end.
+int uriescapelength(char *in,int len) {
+    int rlen=0;
+    int i;
+    for(i=0;i<len;i++) {
+        // strchr() also matches the terminator of uri_reserved, so a NUL
+        // byte must be excluded explicitly; it is escaped as "%00".
+        rlen+=(in[i] != '\0' && strchr(uri_reserved,in[i]))?1:3;
+    }
+    return rlen;
+}
+
+// Percent-encodes the first len bytes of in into out.
+// Bytes listed in uri_reserved are copied as they are; every other byte,
+// including NUL and bytes >= 0x80, becomes '%' followed by two uppercase
+// hex digits.
+// out is NOT NUL-terminated by this function, and it must have room for
+// uriescapelength(in,len) bytes. make sure your out char * has enough space!
+void uriescape(char *in,char *out,int len) {
+    static const char hex[]="0123456789ABCDEF";
+    int i;
+    int j=0;
+    for(i=0;i<len;i++) {
+        // Work on the unsigned value: with a signed char, bytes >= 0x80 would
+        // produce negative shifts/remainders and index outside of hex[].
+        unsigned char c=(unsigned char)in[i];
+        if(c != '\0' && strchr(uri_reserved,c)) {
+            out[j++]=in[i];
+        } else {
+            out[j++]='%';
+            out[j++]=hex[c >> 4];   // high nibble
+            out[j++]=hex[c & 0x0F]; // low nibble
+        }
+    }
+}
+
+// Decodes percent-escapes in the NUL-terminated string in and writes the
+// result, NUL-terminated, to out.
+// A '%' followed by two hex digits becomes the byte with that value; a '%'
+// that is not followed by two hex digits is copied through as a literal '%'.
+// The output is never longer than the input, so strlen(in)+1 bytes of out
+// are enough.
+// Returns the number of bytes written, not counting the terminator.
+// NOTE(review): all copies use memmove, presumably so that in and out may be
+// the same buffer -- confirm before relying on in-place decoding.
+int uriunescape(char *in,char *out) {
+    char *src=in;
+    char *dst=out;
+    char *pct;
+    char hi,lo;
+    // Input without any '%' is copied up front; the tail copy below repeats it.
+    if(strchr(src,'%') == NULL) memmove(out,in,strlen(in));
+    for(pct=strchr(src,'%');pct != NULL;pct=strchr(src,'%')) {
+        if(pct != src) {
+            // Plain bytes in front of the '%' are copied through untouched.
+            memmove(dst,src,pct-src);
+            dst+=pct-src;
+            src=pct;
+        }
+        if(isxdigit(pct[1]) && isxdigit(pct[2])) {
+            int value;
+            // Both digits are read before the output byte is stored.
+            hi=toupper(pct[1]);
+            lo=toupper(pct[2]);
+            value=(hi-'0' < 10) ? hi-'0' : hi-'A'+10;
+            value<<=4;
+            value+=(lo-'0' < 10) ? lo-'0' : lo-'A'+10;
+            *dst++=value;
+            src+=3; // step over the whole %XX
+        } else {
+            // Not a valid escape: emit the '%' itself and rescan right after
+            // it, because the next character might be another '%'.
+            *dst++='%';
+            src++;
+        }
+    }
+    // Whatever follows the last '%' is copied as it is, then terminated.
+    memmove(dst,src,strlen(src));
+    dst[strlen(src)]=0;
+    return dst+strlen(src)-out;
+}
+
+// The pieces of a URI, each stored as a char *.
+// Every piece can be reached three ways: by position through A[], by a
+// one-letter name, or by a long name. This relies on anonymous structs and
+// unions (standard since C11).
+// warning from the original author: it is technically undefined behavior to
+// set one half of a union then use the other half.
+// NOTE(review): all members here have the same type and C11 allows reading a
+// union member other than the one last written; this is presumably only a
+// concern if the header is compiled as C++ -- confirm.
+struct uri {
+    union {
+        // Positional view, in the same order as the named members below.
+        char *A[8];
+        struct {
+            union {
+                char *scheme;
+                char *s;
+            };
+            union {
+                char *username;
+                char *u;
+            };
+            union {
+                char *password;
+                char *k;
+            };
+            union {
+                char *domain;
+                char *d;
+            };
+            union {
+                char *port; // capital P is the port, lowercase p is the path
+                char *P;
+            };
+            union {
+                char *path;
+                char *p;
+            };
+            union {
+                char *query_string;
+                char *q;
+            };
+            union {
+                char *fragment_id;
+                char *f;
+            };
+        };
+    };
+};
+
+// Compares two parsed URIs piece by piece (A[0] through A[7]).
+// Returns 0 when every piece matches, otherwise a bit mask where, for piece i:
+//   bit i      both sides have the piece and strcmp() says they differ
+//   bit i+8    a has the piece but b does not
+//   bit i+16   b has the piece but a does not
+// A piece that is missing (NULL) on both sides counts as a match.
+unsigned int uricmp(struct uri *a,struct uri *b) {
+    unsigned int mask=0;
+    int idx;
+    for(idx=0;idx<8;idx++) {
+        char *lhs=a->A[idx];
+        char *rhs=b->A[idx];
+        if(lhs && rhs) {
+            if(strcmp(lhs,rhs) != 0) mask|=1u<<idx;
+        } else if(lhs) {
+            mask|=1u<<(idx+8); // only a has it
+        } else if(rhs) {
+            mask|=1u<<(idx+16); // only b has it
+        }
+    }
+    return mask;
+}
+
+/*
+ schemes are case-insensitive, but the canonical form is lowercase.
+ domain is case insensitive. return it lowercased?
+ port is optional and in decimal
+ path
+ scheme://username:password@domain:port/path?query_string#fragment_id
+ mailto:username@domain
+
+ optional stuff:
+ scheme, username, password, port, path, query_string, fragment_id
+*/
+
+//should it be strict about which characters are allowed?
+//or should it just try to ignore malformed input?
+
+// Splits the URI in line into its pieces, in place, and stores pointers to
+// them in *u.
+//
+// line is modified: separators are overwritten with NUL bytes, the scheme is
+// lowercased and the authority is shifted one byte to the left. Every pointer
+// stored in *u points into line, so line must be writable and must outlive u.
+//
+// This function deliberately does not clear *u first. fragment_id and
+// query_string are always assigned (NULL when absent); the other members are
+// only assigned when the matching piece is looked for, so the caller should
+// zero or pre-fill *u before calling.
+//
+// After the fragment, the query and an optional leading "scheme:" have been
+// cut off, what is left is handled as follows:
+//   "/x..." (single slash) or ".x..."  -> stored whole as path
+//   "//authority/path"                 -> username, password, domain, port
+//                                         and path are split out
+//   anything else                      -> nothing more is stored
+// NOTE(review): the last case means the remainder of "scheme:path" (for
+// example "mailto:user@host") and of a bare "dir/file" is not stored
+// anywhere -- confirm whether it should go into path.
+//
+// Always returns 1; there is currently no failure path.
+int urifromline(struct uri *u,char *line) {
+    char *t;
+    char *colon;
+    // Fragment first, then query: once the '#' has been replaced by NUL, a
+    // '?' inside the fragment can no longer be mistaken for the query start.
+    if((u->fragment_id=strchr(line,'#'))) {
+        *u->fragment_id=0;
+        u->fragment_id++;
+    }
+    if((u->query_string=strchr(line,'?'))) {
+        *u->query_string=0;
+        u->query_string++;
+    }
+
+    //let's see if this starts with a scheme
+    // A scheme starts with a letter and runs up to the first ':'; everything
+    // in between has to be a letter, a digit, '+', '-' or '.'.
+    colon=strchr(line,':');
+    if(colon && ((*line >= 'a' && *line <= 'z') || (*line >= 'A' && *line <= 'Z'))) {
+        // Walk backwards from the ':' with a local pointer, so u->scheme is
+        // only assigned when the whole candidate turns out to be valid.
+        for(t=colon-1;t > line;t--) {
+            if((*t >= 'a' && *t <= 'z') ||
+               (*t >= 'A' && *t <= 'Z') ||
+               (*t >= '0' && *t <= '9') ||
+               *t == '+' || *t == '-' || *t == '.') {
+                //this is still a scheme.
+            } else {
+                break;
+            }
+        }
+        if(t == line) {//we got through the for loop alright. line starts with a scheme.
+            u->scheme=line;
+            *colon=0;
+            line=colon+1;
+            // Lowercase the scheme (ASCII: 'A'+' ' == 'a').
+            for(t=u->scheme;*t;t++) {
+                if(*t >= 'A' && *t <= 'Z') *t+=' ';
+            }
+        }
+    }
+
+    if(*line == '/' && *(line+1) != '/') { //we have a relative path. /like:this.maybe
+        u->path=line;
+        return 1;//we're done. nothing else to do.
+    }
+    if(*line == '.') { //we have a relative path like: ./derp or ../merp
+        u->path=line;
+        return 1;//we're done here. nothing else to do.
+    }
+
+    if(*line == '/' && line[1] == '/') {//we have an authority section.
+        // Shift the authority one byte to the left, over the second '/', up
+        // to the next '/' (or the end). That frees one byte to hold the NUL
+        // that terminates the authority without destroying the path's '/'.
+        for(t=line+1;*(t+1) && *(t+1) != '/';t++) {
+            *t=*(t+1);
+        }
+        *t=0;
+        u->path=t+1;//if there was a /, path points at it and the stuff after.
+        //if there wasn't a /, it points at a null byte. so "empty"
+        u->username=line+1;
+    } else {
+        //we have all we need.
+        return 1;
+    }
+
+    if(u->username) {//this contains all of the authority.
+        if((u->domain=strchr(u->username,'@'))) {//we have user@host at least.
+            *u->domain=0;
+            u->domain++;
+        } else {//this isn't really a username. it is the domain.
+            u->domain=u->username;
+            u->username=0;
+        }
+    }
+    //if we still have u->username we try to split to user and password
+    if(u->username) {
+        if((u->password=strchr(u->username,':'))) {
+            *u->password=0;
+            u->password++;
+        }
+    }
+    if(u->domain) {
+        if((u->port=strchr(u->domain,']')) && *u->domain == '[') {//this is an IPv6 host
+            // The ']' is overwritten, so domain keeps its leading '[' but
+            // loses the closing bracket.
+            *u->port=0;
+            u->port++;
+            if(*u->port == ':') {
+                *u->port=0;
+                u->port++;//if it ends up being empty, whatever. that's a URI like: http://host:/path
+            }
+        } else { //we're safe to split port off at :
+            if((u->port=strchr(u->domain,':'))) {
+                *u->port=0;
+                u->port++;
+            } //there isn't a port. leave it unset.
+        }
+    }
+    return 1;
+}
+
+#endif