Also add url_parsing to this crate.

author: Andreas Grois <andi@grois.info> 2022-10-09 14:00:38 +0200
committer: Andreas Grois <andi@grois.info> 2022-10-09 14:00:38 +0200
commit: d6d345207530ec3232d937aeee3b0c9255b33129 (patch)
tree: 0645a54d0f695092ed6c0003777d2e212a99e5e5 /src/url_parsing
parent: 5e51b706d54a26470f33d1342f4666d5aab921fc (diff)
1 files changed, 508 insertions, 0 deletions
diff --git a/src/url_parsing/mod.rs b/src/url_parsing/mod.rs
new file mode 100644
index 0000000..e0a95ad
--- /dev/null
+++ b/src/url_parsing/mod.rs
@@ -0,0 +1,508 @@
+use crate::UrlParsing;
+use std::ops::Deref;
+use std::ops::Add;
+
+impl UrlParsing {
+    /// Computes the `used_text` for an input URL: parses it, keeps only the parts enabled in this `UrlParsing` object, and joins them again.
+    /// Aims to be roughly compatible with Passwordmaker Pro.
+    pub(super) fn make_used_text_from_url(&self, input : &str, ) -> String {
+        parse_url(input).filter_by_settings(self).recombine()
+    }
+
+    fn is_protocol_used(&self) -> bool{ //true for both "used" modes, false only for Ignored
+        match self.use_protocol{
+            crate::ProtocolUsageMode::Ignored => false,
+            crate::ProtocolUsageMode::Used
+             | crate::ProtocolUsageMode::UsedWithUndefinedIfEmpty
+             => true,
+        }
+    }
+    fn use_protocol_undefined_fallback(&self) -> bool{ //true only for UsedWithUndefinedIfEmpty
+        match self.use_protocol{
+            crate::ProtocolUsageMode::Ignored
+             | crate::ProtocolUsageMode::Used
+             => false,
+            crate::ProtocolUsageMode::UsedWithUndefinedIfEmpty => true,
+        }
+    }
+}
+
+#[cfg_attr(test, derive(PartialEq, Debug, Clone))]
+struct UrlParts<'s> { //the pieces parse_url cuts out of its input. All fields are string slices, a missing piece is an empty string.
+    protocol : &'s str, //the text before the first ':' of the input
+    userinfo : &'s str, //Treating this field separately is an addition to the functionality offered by PasswordMaker Pro
+    subdomain : &'s str, //this is not part of the official URI spec. But PasswordMaker Pro uses it.
+    domain: &'s str, //at most the last two dot-separated labels of the host
+    port: &'s str, //this would not need to be separated from path_query_fragment, but it's easier to parse if it's separate.
+    path_query_fragment: &'s str //we don't need to separate those. Passwordmaker doesn't either.
+}
+
+impl<'s> UrlParts<'s> {
+    fn filter_by_settings(self, settings : &UrlParsing) -> UsedUrlParts<'s>{ //replaces every part that is disabled in settings by an empty string
+        let has_protocol = settings.is_protocol_used() && !self.protocol.is_empty(); //the protocol only counts if it is enabled AND present in the input
+        UsedUrlParts{
+            protocol: //PasswordMaker Pro compatibility: Protocol is handled _weird_...
+                if has_protocol { self.protocol }
+                else if settings.use_protocol_undefined_fallback() { "undefined" } //fallback mode is set, but the input had no protocol
+                else { <&str>::default() },
+            protocol_separator: if has_protocol { "://" } else { <&str>::default() }, //no separator after the "undefined" fallback. This is again some PasswordMaker Pro weirdness...
+            userinfo: if settings.use_userinfo { self.userinfo } else { <&str>::default() },
+            subdomain: if settings.use_subdomains { self.subdomain } else { <&str>::default() },
+            domain: if settings.use_domain { self.domain } else { <&str>::default() },
+            port: if settings.use_port_path { self.port } else { <&str>::default() }, //port and path share a single setting
+            path_query_fragment: if settings.use_port_path { self.path_query_fragment } else { <&str>::default() },
+        }
+    }
+}
+
+#[cfg_attr(test, derive(PartialEq, Debug))]
+struct UsedUrlParts<'s> { //the URL parts that survived filter_by_settings. A part that is not used is an empty string.
+    protocol : &'s str,
+    protocol_separator : &'s str, //filter_by_settings sets this to "://" or to an empty string
+    userinfo : &'s str, //Treating this field separately is an addition to the functionality offered by PasswordMaker Pro
+    subdomain : &'s str, //this is not part of the official URI spec. But PasswordMaker Pro uses it.
+    domain: &'s str,
+    port: &'s str, //this would not need to be separated from path_query_fragment, but it's easier to parse if it's separate.
+    path_query_fragment: &'s str //we don't need to separate those. Passwordmaker doesn't either.
+}
+
+impl<'s> UsedUrlParts<'s> {
+    #[allow(clippy::doc_markdown)]
+    /// Tries to assemble a string in a way that's at least somehow compatible with PasswordMaker Pro.
+    /// This prioritizes ease of use ("what the user expects") over correct URI parsing.
+    fn recombine(self) -> String {
+        //matching would need 64 arms... Too much work, soooo, a couple of ifs and less sanity instead.
+        let has_userinfo = !self.userinfo.is_empty();
+        let has_subdomain = !self.subdomain.is_empty();
+        let has_domain = !self.domain.is_empty();
+        let has_port = !self.port.is_empty();
+        let has_path_query_fragment = !self.path_query_fragment.is_empty();
+        
+        //by doing all logic on &str, we defer allocations to the very last moment. Also, the syntax is more readable.
+        let parts = [
+            self.protocol,
+            self.protocol_separator,
+            self.userinfo,
+            if has_userinfo && (has_domain || has_subdomain || has_port|| has_path_query_fragment) { "@" } else { <&str>::default() }, //"@" only if something follows the userinfo
+            self.subdomain,
+            if has_subdomain && has_domain { "." } else { <&str>::default() }, //"." only between a subdomain and a domain
+            self.domain,
+            if has_port && (has_userinfo || has_domain || has_subdomain) { ":" } else { <&str>::default() }, //":" only if something precedes the port
+            self.port,
+            self.path_query_fragment,
+        ];
+
+        let needed_size = parts.iter().map(Deref::deref).map(<str>::len).sum(); //total byte length of all parts
+        parts.iter().map(Deref::deref).fold(String::with_capacity(needed_size), String::add) //capacity is reserved up front, so appending the parts should not need to reallocate
+    }
+}
+
+#[allow(clippy::doc_markdown)]
+/// Parses the input URI in a way that resembles the behaviour of PasswordMaker Pro. This is intentionally not following the URI standard.
+/// It prioritizes ease-of-use over strictly following the URI standard.
+/// The idea here is that users tend to input strings of the form "www.somedomain.com", which is not a valid URI (the scheme is not optional).
+/// Input of this form should still work though, in order not to confuse users.
+fn parse_url(input : &str) -> UrlParts{
+    let maybe_protocol = input.split_once(':'); //everything before the first ':' is treated as the protocol
+    let has_protocol = maybe_protocol.is_some();
+    let (protocol, rest) = maybe_protocol.unwrap_or((<&str>::default(), input));
+    let removed_authority_marker = rest.strip_prefix("//"); //a leading "//" marks the start of an authority
+    let has_authority = removed_authority_marker.is_some();
+    let rest = removed_authority_marker.unwrap_or(rest);
+
+    //Authority stops at first / character. Or, if none encountered, at end of input. Slash is part of path.
+    //If there is a protocol, but no authority, we must treat everything after the initial ':' as path though.
+    let first_character_of_path = if has_protocol && !has_authority {Some(0)} else {rest.find('/')};
+    let (authority, path_query_fragment) = first_character_of_path.map_or((rest, <&str>::default()),|mid| rest.split_at(mid));
+    //must split authority at '@' characters. Otherwise ':' is ambiguous.
+    let (userinfo, host_and_port) = authority.split_once('@').unwrap_or((<&str>::default(), authority));
+    let (address, port) = host_and_port.split_once(':').unwrap_or((host_and_port, <&str>::default())); //the first ':' after the userinfo starts the port
+    let separator_between_subdom_and_domain = address.rmatch_indices('.').nth(1); //the second-to-last '.', if there is one
+    let (subdomain, domain_with_leading_dot) = separator_between_subdom_and_domain.map_or((<&str>::default(), address), |(i, _)| address.split_at(i));
+    let domain = domain_with_leading_dot.strip_prefix('.').unwrap_or(domain_with_leading_dot);
+    UrlParts{protocol, userinfo, subdomain, domain, port, path_query_fragment}
+}
+
+#[cfg(test)]
+mod url_parsing_tests {
+    use crate::ProtocolUsageMode;
+
+    use super::*;
+
+    /// Just tries to split some example urls and checks if the result is as expected. This tests against PasswordMaker Pro behaviour, not proper URI format.
+    #[test]
+    fn uri_splitting_test_full_uri(){
+        let input = "http://anon:12345@some.subdomain.of.some.domain.com:8080/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: "anon:12345",
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: "8080",
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_no_userinfo(){
+        let input = "http://some.subdomain.of.some.domain.com:8080/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: <&str>::default(),
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: "8080",
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_no_port(){  
+        let input = "http://anon:12345@some.subdomain.of.some.domain.com/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: "anon:12345",
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_no_domain(){  
+        let input = "http://anon:12345@:8080/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: "anon:12345",
+            subdomain: <&str>::default(),
+            domain: <&str>::default(),
+            port: "8080",
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_no_domain_no_port(){  
+        let input = "http://anon:12345@/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: "anon:12345",
+            subdomain: <&str>::default(),
+            domain: <&str>::default(),
+            port: <&str>::default(),
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_empty_path(){
+        let input = "http://anon:12345@some.subdomain.of.some.domain.com:8080";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: "anon:12345",
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: "8080",
+            path_query_fragment: <&str>::default(),
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_test_only_protocol_and_path(){
+        let input = "http:some/path/";
+        let expected = UrlParts{
+            protocol: "http",
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: <&str>::default(),
+            port: <&str>::default(),
+            path_query_fragment: "some/path/",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+
+    /// This triggers me. It should not work (the scheme isn't optional), but users would miss it.
+    /// Password and port are not included either, because the text before their ':' would (correctly) be identified as the scheme.
+    #[test]
+    fn uri_splitting_missing_protocol(){
+        let input = "anon@some.subdomain.of.some.domain.com/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: <&str>::default(),
+            userinfo: "anon",
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_just_domain_and_path(){
+        let input = "some.subdomain.of.some.domain.com/some/path/with?query&and#fragment";
+        let expected = UrlParts{
+            protocol: <&str>::default(),
+            userinfo: <&str>::default(),
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path/with?query&and#fragment",
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_just_domain_and_subdomain(){
+        let input = "some.subdomain.of.some.domain.com";
+        let expected = UrlParts{
+            protocol: <&str>::default(),
+            userinfo: <&str>::default(),
+            subdomain: "some.subdomain.of.some",
+            domain: "domain.com",
+            port: <&str>::default(),
+            path_query_fragment: <&str>::default(),
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_just_domain(){
+        let input = "domain.com";
+        let expected = UrlParts{
+            protocol: <&str>::default(),
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: "domain.com",
+            port: <&str>::default(),
+            path_query_fragment: <&str>::default(),
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+    #[test]
+    fn uri_splitting_only_protocol(){
+        let input = "ftp:";
+        let expected = UrlParts{
+            protocol: "ftp",
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: <&str>::default(),
+            port: <&str>::default(),
+            path_query_fragment: <&str>::default(),
+        };
+        let result = parse_url(input);
+        assert_eq!(result, expected);
+    }
+
+    // Above tests are incomplete. I mean, there are 64 combinations... And then there could be errors...
+    // Soo, let's just pretend it's fine, and if there are bugs, add the specific buggy input.
+    //-----------------------------------------------------------------------------
+    // Reassembly tests
+    // Again our valid input range is 64 values... And again we just test some samples that are known to be weird.
+    // For everything else, let's wait for bug reports.
+
+    /// However, for settings application, every combination can be tested.
+    #[test]
+    fn apply_settings_to_url_parts_no_undefined_protocol(){
+        for i in 0..64 { //the six low bits of i pick the five settings and whether the input has a protocol
+            let settings = UrlParsing {
+                use_protocol: if i%2 == 0 { ProtocolUsageMode::Used } else { ProtocolUsageMode::Ignored }, //bit 0
+                use_userinfo: (i/2)%2 == 0, //bit 1
+                use_subdomains: (i/4)%2 == 0, //bit 2
+                use_domain: (i/8)%2 == 0, //bit 3
+                use_port_path: (i/16)%2 == 0, //bit 4
+            };
+            
+            let inputs = UrlParts {
+                protocol: if (i/32)%2 == 0 {"proto"} else {""}, //bit 5: input with or without a protocol
+                userinfo: "plasmic",
+                subdomain: "pirate",
+                domain: "hordes",
+                port: "420",
+                path_query_fragment: "under/blackened#banners",
+            };
+
+            let output = inputs.clone().filter_by_settings(&settings);
+            if settings.is_protocol_used() { assert_eq!(output.protocol, inputs.protocol) } else { assert_eq!(output.protocol, "") };
+            if settings.is_protocol_used() && !inputs.protocol.is_empty() { assert_eq!(output.protocol_separator, "://") } else { assert_eq!(output.protocol_separator, "") };
+            if settings.use_userinfo { assert_eq!(output.userinfo, inputs.userinfo) } else { assert_eq!(output.userinfo, "")};
+            if settings.use_subdomains { assert_eq!(output.subdomain, inputs.subdomain) } else { assert_eq!(output.subdomain, "")};
+            if settings.use_domain { assert_eq!(output.domain, inputs.domain) } else { assert_eq!(output.domain, "")};
+            if settings.use_port_path { assert_eq!(output.port, inputs.port) } else { assert_eq!(output.port, "")};
+            if settings.use_port_path { assert_eq!(output.path_query_fragment, inputs.path_query_fragment) } else { assert_eq!(output.path_query_fragment, "")};
+        }
+    }
+    #[test]
+    fn apply_settings_to_url_parts_undefined_protocol(){
+        for i in 0..64 { //the six low bits of i pick the five settings and whether the input has a protocol
+            let settings = UrlParsing {
+                use_protocol: if i%2 == 0 { ProtocolUsageMode::UsedWithUndefinedIfEmpty } else { ProtocolUsageMode::Ignored }, //bit 0
+                use_userinfo: (i/2)%2 == 0, //bit 1
+                use_subdomains: (i/4)%2 == 0, //bit 2
+                use_domain: (i/8)%2 == 0, //bit 3
+                use_port_path: (i/16)%2 == 0, //bit 4
+            };
+            
+            let inputs = UrlParts {
+                protocol: if (i/32)%2 == 0 {"proto"} else {""}, //bit 5: input with or without a protocol
+                userinfo: "plasmic",
+                subdomain: "pirate",
+                domain: "hordes",
+                port: "420",
+                path_query_fragment: "under/blackened#banners",
+            };
+
+            let output = inputs.clone().filter_by_settings(&settings);
+            if settings.is_protocol_used() { 
+                if !inputs.protocol.is_empty() {
+                    assert_eq!(output.protocol, inputs.protocol) 
+                } else {
+                    assert_eq!(output.protocol, "undefined") //the fallback replaces the missing protocol
+                }
+            } else { 
+                assert_eq!(output.protocol, "") 
+            };
+            if settings.is_protocol_used() && !inputs.protocol.is_empty() { assert_eq!(output.protocol_separator, "://") } else { assert_eq!(output.protocol_separator, "") }; //no separator after the "undefined" fallback
+            if settings.use_userinfo { assert_eq!(output.userinfo, inputs.userinfo) } else { assert_eq!(output.userinfo, "")};
+            if settings.use_subdomains { assert_eq!(output.subdomain, inputs.subdomain) } else { assert_eq!(output.subdomain, "")};
+            if settings.use_domain { assert_eq!(output.domain, inputs.domain) } else { assert_eq!(output.domain, "")};
+            if settings.use_port_path { assert_eq!(output.port, inputs.port) } else { assert_eq!(output.port, "")};
+            if settings.use_port_path { assert_eq!(output.path_query_fragment, inputs.path_query_fragment) } else { assert_eq!(output.path_query_fragment, "")};
+        }
+    }
+
+    #[test]
+    fn recombine_full_url_test() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: "horst:12345",
+            subdomain: "www",
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://horst:12345@www.example.com:8080/some/path");
+    }
+    #[test]
+    fn recombine_user_but_no_subdomain() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: "horst:12345",
+            subdomain: <&str>::default(),
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://horst:12345@example.com:8080/some/path");
+    }
+    #[test]
+    fn recombine_no_user_but_subdomain() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: <&str>::default(),
+            subdomain: "w3",
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://w3.example.com:8080/some/path");
+    }
+    #[test]
+    fn recombine_no_user_no_subdomain() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://example.com:8080/some/path");
+    }
+    #[test]
+    fn recombine_no_user_no_subdomain_no_port() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: "example.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://example.com/some/path");
+    }
+    #[test]
+    fn recombine_undefined_protocol() {
+        let input = UsedUrlParts{
+            protocol: "undefined",
+            protocol_separator: <&str>::default(),
+            userinfo: "horst:12345",
+            subdomain: "www",
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "undefinedhorst:12345@www.example.com:8080/some/path");
+    }
+    #[test]
+    fn recombine_undefined_protocol_no_user_no_subdomain() {
+        let input = UsedUrlParts{
+            protocol: "undefined",
+            protocol_separator: <&str>::default(),
+            userinfo: <&str>::default(),
+            subdomain: <&str>::default(),
+            domain: "example.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "undefinedexample.com/some/path");
+    }
+    #[test]
+    fn recombine_no_protocol() {
+        let input = UsedUrlParts{
+            protocol: <&str>::default(),
+            protocol_separator: <&str>::default(),
+            userinfo: <&str>::default(),
+            subdomain: "www",
+            domain: "example.com",
+            port: <&str>::default(),
+            path_query_fragment: "/some/path",
+        };
+        let result = input.recombine();
+        assert_eq!(result, "www.example.com/some/path");
+    }
+    #[test]
+    fn recombine_empty_path() {
+        let input = UsedUrlParts{
+            protocol: "xmpp",
+            protocol_separator: "://",
+            userinfo: "horst:12345",
+            subdomain: "www",
+            domain: "example.com",
+            port: "8080",
+            path_query_fragment: <&str>::default(),
+        };
+        let result = input.recombine();
+        assert_eq!(result, "xmpp://horst:12345@www.example.com:8080");
+    }
+}
+\ No newline at end of file
author	Andreas Grois <andi@grois.info>	2022-10-09 14:00:38 +0200
committer	Andreas Grois <andi@grois.info>	2022-10-09 14:00:38 +0200
commit	d6d345207530ec3232d937aeee3b0c9255b33129 (patch)
tree	0645a54d0f695092ed6c0003777d2e212a99e5e5 /src/url_parsing
parent	5e51b706d54a26470f33d1342f4666d5aab921fc (diff)